From 78bc1b64a6dc3fb6191355a5e1b502be8b3668e7 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sun, 14 Jul 2024 08:36:33 +0400 Subject: [PATCH] AMDGPU: Move attributor into optimization pipeline (#83131) Removing it from the codegen pipeline induces a lot of test churn because llc is no longer optimizing out implicit arguments to kernels. Mostly mechanical, but there are some creative test updates. I preferred to take the changes as-is in tests where the ABI isn't relevant. In cases where it's more relevant, or the optimize out logic was too ingrained in the test, I pre-run the optimization. Some cases manually add attributes to disable inputs. --- clang/test/CodeGenHIP/default-attributes.hip | 29 +- llvm/docs/ReleaseNotes.rst | 4 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 13 +- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 6 + .../CodeGen/AMDGPU/GlobalISel/addsubu64.ll | 16 +- .../AMDGPU/GlobalISel/atomicrmw_fmax.ll | 152 +- .../AMDGPU/GlobalISel/atomicrmw_fmin.ll | 152 +- .../AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll | 537 +- .../AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll | 575 +- .../AMDGPU/GlobalISel/bool-legalization.ll | 8 +- .../GlobalISel/call-outgoing-stack-args.ll | 20 +- .../AMDGPU/GlobalISel/cvt_f32_ubyte.ll | 76 +- .../GlobalISel/divergent-control-flow.ll | 2 +- .../GlobalISel/dynamic-alloca-uniform.ll | 30 +- .../AMDGPU/GlobalISel/extractelement.ll | 276 +- .../AMDGPU/GlobalISel/flat-scratch-init.ll | 8 +- .../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 192 +- .../AMDGPU/GlobalISel/fp-atomics-gfx940.ll | 80 +- .../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll | 845 +- llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll | 112 +- .../AMDGPU/GlobalISel/function-returns.ll | 4 +- ...licit-kernarg-backend-usage-global-isel.ll | 40 +- .../GlobalISel/inline-asm-mismatched-size.ll | 3 + .../GlobalISel/insertelement-stack-lower.ll | 6 +- .../AMDGPU/GlobalISel/insertelement.large.ll | 8 +- ...irtranslator-amdgpu_kernel-system-sgprs.ll | 2 +- 
.../GlobalISel/irtranslator-amdgpu_kernel.ll | 472 +- .../AMDGPU/GlobalISel/irtranslator-fence.ll | 120 + .../GlobalISel/irtranslator-sibling-call.ll | 269 +- .../AMDGPU/GlobalISel/lds-global-value.ll | 2 +- .../AMDGPU/GlobalISel/lds-zero-initializer.ll | 38 +- .../GlobalISel/llvm.amdgcn.div.scale.ll | 528 +- .../GlobalISel/llvm.amdgcn.end.cf.i32.ll | 10 +- .../GlobalISel/llvm.amdgcn.end.cf.i64.ll | 4 +- .../llvm.amdgcn.global.atomic.csub.ll | 24 +- .../llvm.amdgcn.global.atomic.fadd.ll | 8 +- .../GlobalISel/llvm.amdgcn.if.break.i32.ll | 18 +- .../GlobalISel/llvm.amdgcn.if.break.i64.ll | 4 +- .../GlobalISel/llvm.amdgcn.intersect_ray.ll | 113 +- .../GlobalISel/llvm.amdgcn.is.private.ll | 23 +- .../GlobalISel/llvm.amdgcn.is.shared.ll | 23 +- .../llvm.amdgcn.kernarg.segment.ptr.ll | 7 +- .../GlobalISel/llvm.amdgcn.mfma.gfx90a.ll | 34 +- .../AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll | 24 +- .../GlobalISel/llvm.amdgcn.queue.ptr.ll | 15 +- .../AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll | 98 +- .../GlobalISel/llvm.amdgcn.set.inactive.ll | 86 +- .../GlobalISel/llvm.amdgcn.trig.preop.ll | 32 +- .../AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll | 126 +- .../GlobalISel/llvm.amdgcn.update.dpp.ll | 83 +- .../GlobalISel/llvm.amdgcn.workgroup.id.ll | 9 +- .../GlobalISel/llvm.amdgcn.workitem.id.ll | 20 +- .../CodeGen/AMDGPU/GlobalISel/localizer.ll | 4 +- .../memory-legalizer-atomic-fence.ll | 82 +- .../AMDGPU/GlobalISel/mul-known-bits.i64.ll | 114 +- llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 24 +- .../AMDGPU/GlobalISel/non-entry-alloca.ll | 46 +- .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 738 +- .../AMDGPU/GlobalISel/shl-ext-reduce.ll | 35 +- .../AMDGPU/GlobalISel/store-local.128.ll | 172 +- .../AMDGPU/GlobalISel/store-local.96.ll | 172 +- .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 486 +- .../AMDGPU/GlobalISel/vni8-across-blocks.ll | 196 +- .../GlobalISel/widen-i8-i16-scalar-loads.ll | 60 +- llvm/test/CodeGen/AMDGPU/add.ll | 327 +- llvm/test/CodeGen/AMDGPU/add.v2i16.ll | 224 +- 
llvm/test/CodeGen/AMDGPU/addrspacecast.ll | 6 +- .../AMDGPU/agpr-copy-no-free-registers.ll | 129 +- .../CodeGen/AMDGPU/agpr-register-count.ll | 2 +- llvm/test/CodeGen/AMDGPU/always-uniform.ll | 2 +- llvm/test/CodeGen/AMDGPU/amd.endpgm.ll | 34 +- ...amdgpu-codegenprepare-fold-binop-select.ll | 2 +- .../AMDGPU/amdgpu-codegenprepare-idiv.ll | 2877 +- .../CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll | 2 +- .../AMDGPU/amdgpu-simplify-libcall-sincos.ll | 48 +- .../CodeGen/AMDGPU/amdgpu.private-memory.ll | 6 +- .../amdgpu.work-item-intrinsics.deprecated.ll | 36 +- llvm/test/CodeGen/AMDGPU/amdpal-elf.ll | 2 +- llvm/test/CodeGen/AMDGPU/anyext.ll | 38 +- .../AMDGPU/atomic_optimizations_buffer.ll | 1510 +- .../atomic_optimizations_global_pointer.ll | 1293 +- .../atomic_optimizations_local_pointer.ll | 2219 +- .../AMDGPU/atomic_optimizations_raw_buffer.ll | 1206 +- .../atomic_optimizations_struct_buffer.ll | 1348 +- llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll | 48 +- .../AMDGPU/attr-amdgpu-waves-per-eu.ll | 4 +- llvm/test/CodeGen/AMDGPU/attributor-noopt.ll | 6 +- llvm/test/CodeGen/AMDGPU/bf16.ll | 576 +- llvm/test/CodeGen/AMDGPU/bfe-combine.ll | 36 +- llvm/test/CodeGen/AMDGPU/bfe-patterns.ll | 56 +- llvm/test/CodeGen/AMDGPU/bfi_int.ll | 236 +- llvm/test/CodeGen/AMDGPU/bfi_nested.ll | 2 +- llvm/test/CodeGen/AMDGPU/bfm.ll | 16 +- llvm/test/CodeGen/AMDGPU/bitreverse.ll | 162 +- llvm/test/CodeGen/AMDGPU/br_cc.f16.ll | 64 +- .../test/CodeGen/AMDGPU/branch-relax-spill.ll | 4 +- llvm/test/CodeGen/AMDGPU/branch-relaxation.ll | 92 +- llvm/test/CodeGen/AMDGPU/bswap.ll | 42 +- .../buffer-fat-pointer-atomicrmw-fadd.ll | 13715 ++++------ .../buffer-fat-pointer-atomicrmw-fmax.ll | 4612 ++-- .../buffer-fat-pointer-atomicrmw-fmin.ll | 4612 ++-- .../CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll | 28 +- llvm/test/CodeGen/AMDGPU/build_vector.ll | 74 +- llvm/test/CodeGen/AMDGPU/call-constexpr.ll | 5 +- .../AMDGPU/call-graph-register-usage.ll | 4 +- .../CodeGen/AMDGPU/call-reqd-group-size.ll | 36 +- 
.../callee-special-input-sgprs-fixed-abi.ll | 6 +- .../callee-special-input-vgprs-packed.ll | 6 +- .../AMDGPU/callee-special-input-vgprs.ll | 4 +- .../CodeGen/AMDGPU/calling-conventions.ll | 212 +- .../test/CodeGen/AMDGPU/carryout-selection.ll | 776 +- llvm/test/CodeGen/AMDGPU/cc-update.ll | 18 +- .../CodeGen/AMDGPU/cf-loop-on-constant.ll | 14 +- .../AMDGPU/cgp-addressing-modes-gfx1030.ll | 2 +- .../AMDGPU/cgp-addressing-modes-gfx908.ll | 2 +- .../CodeGen/AMDGPU/cgp-bitfield-extract.ll | 2 +- llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll | 26 +- llvm/test/CodeGen/AMDGPU/clamp-modifier.ll | 234 +- llvm/test/CodeGen/AMDGPU/clamp.ll | 727 +- llvm/test/CodeGen/AMDGPU/cluster_stores.ll | 12 +- .../CodeGen/AMDGPU/coalesce-vgpr-alignment.ll | 6 +- llvm/test/CodeGen/AMDGPU/code-object-v3.ll | 4 +- .../AMDGPU/codegen-internal-only-func.ll | 27 +- llvm/test/CodeGen/AMDGPU/collapse-endcf.ll | 14 +- .../CodeGen/AMDGPU/combine-cond-add-sub.ll | 226 +- .../CodeGen/AMDGPU/combine-reg-or-const.ll | 4 +- .../CodeGen/AMDGPU/combine-vload-extract.ll | 4 +- llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll | 150 +- .../CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll | 32 +- llvm/test/CodeGen/AMDGPU/copy_to_scc.ll | 6 +- llvm/test/CodeGen/AMDGPU/ctlz.ll | 270 +- llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 272 +- llvm/test/CodeGen/AMDGPU/ctpop16.ll | 88 +- llvm/test/CodeGen/AMDGPU/ctpop64.ll | 124 +- llvm/test/CodeGen/AMDGPU/cttz.ll | 168 +- llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll | 166 +- llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 251 +- .../CodeGen/AMDGPU/dag-divergence-atomic.ll | 201 +- ...dagcomb-extract-vec-elt-different-sizes.ll | 6 +- .../CodeGen/AMDGPU/dagcombine-setcc-select.ll | 8 +- .../AMDGPU/divergence-driven-buildvector.ll | 152 +- .../AMDGPU/divergence-driven-sext-inreg.ll | 8 +- .../AMDGPU/divergence-driven-trunc-to-i1.ll | 12 +- llvm/test/CodeGen/AMDGPU/ds-alignment.ll | 90 +- .../CodeGen/AMDGPU/ds-combine-large-stride.ll | 26 +- .../AMDGPU/ds-combine-with-dependence.ll | 10 
+- llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll | 80 +- llvm/test/CodeGen/AMDGPU/ds_read2.ll | 243 +- llvm/test/CodeGen/AMDGPU/ds_write2.ll | 150 +- llvm/test/CodeGen/AMDGPU/early-inline.ll | 1 + llvm/test/CodeGen/AMDGPU/elf-notes.ll | 4 +- ...cannot-create-empty-or-backward-segment.ll | 10 +- .../expand-scalar-carry-out-select-user.ll | 8 +- .../CodeGen/AMDGPU/extract_vector_dynelt.ll | 10 +- .../CodeGen/AMDGPU/extract_vector_elt-f16.ll | 213 +- .../CodeGen/AMDGPU/extract_vector_elt-i16.ll | 8 +- .../CodeGen/AMDGPU/extract_vector_elt-i8.ll | 12 +- .../CodeGen/AMDGPU/extractelt-to-trunc.ll | 20 +- llvm/test/CodeGen/AMDGPU/fabs.f16.ll | 167 +- llvm/test/CodeGen/AMDGPU/fabs.ll | 112 +- llvm/test/CodeGen/AMDGPU/fadd.f16.ll | 132 +- .../fast-unaligned-load-store.global.ll | 36 +- llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 469 +- llvm/test/CodeGen/AMDGPU/fcanonicalize.ll | 512 +- llvm/test/CodeGen/AMDGPU/fcmp.f16.ll | 932 +- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 601 +- llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll | 337 +- llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll | 437 +- llvm/test/CodeGen/AMDGPU/fdiv.f16.ll | 282 +- llvm/test/CodeGen/AMDGPU/fdiv.ll | 290 +- .../CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll | 92 +- llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll | 52 +- llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll | 424 +- llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 714 +- llvm/test/CodeGen/AMDGPU/flat_atomics.ll | 3306 +-- .../CodeGen/AMDGPU/flat_atomics_i32_system.ll | 258 +- llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll | 1104 +- .../CodeGen/AMDGPU/flat_atomics_i64_system.ll | 98 +- llvm/test/CodeGen/AMDGPU/fma-combine.ll | 788 +- llvm/test/CodeGen/AMDGPU/fma.ll | 8 +- llvm/test/CodeGen/AMDGPU/fmax3.ll | 32 +- llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll | 16 +- llvm/test/CodeGen/AMDGPU/fmaximum.ll | 8 +- llvm/test/CodeGen/AMDGPU/fmed3.ll | 818 +- llvm/test/CodeGen/AMDGPU/fmin3.ll | 48 +- llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll | 32 +- 
llvm/test/CodeGen/AMDGPU/fminimum.ll | 8 +- .../AMDGPU/fmul-2-combine-multi-use.ll | 324 +- llvm/test/CodeGen/AMDGPU/fmul.f16.ll | 244 +- llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll | 356 +- llvm/test/CodeGen/AMDGPU/fnearbyint.ll | 123 +- llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 72 +- llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll | 178 +- llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll | 58 +- llvm/test/CodeGen/AMDGPU/fneg-fabs.ll | 48 +- .../CodeGen/AMDGPU/fneg-modifier-casting.ll | 28 +- llvm/test/CodeGen/AMDGPU/fneg.f16.ll | 128 +- llvm/test/CodeGen/AMDGPU/fneg.ll | 218 +- .../test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll | 122 +- llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll | 342 +- llvm/test/CodeGen/AMDGPU/fp-classify.ll | 322 +- .../AMDGPU/fp-min-max-buffer-atomics.ll | 247 +- .../AMDGPU/fp-min-max-buffer-ptr-atomics.ll | 233 +- llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll | 6 +- llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll | 6 +- llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll | 6 +- .../CodeGen/AMDGPU/fp64-atomics-gfx90a.ll | 1002 +- .../AMDGPU/fp64-min-max-buffer-atomics.ll | 128 +- .../AMDGPU/fp64-min-max-buffer-ptr-atomics.ll | 128 +- llvm/test/CodeGen/AMDGPU/fp_to_sint.ll | 118 +- llvm/test/CodeGen/AMDGPU/fp_to_uint.ll | 104 +- llvm/test/CodeGen/AMDGPU/fpext.f16.ll | 128 +- llvm/test/CodeGen/AMDGPU/fptosi.f16.ll | 50 +- llvm/test/CodeGen/AMDGPU/fptoui.f16.ll | 55 +- llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll | 160 +- llvm/test/CodeGen/AMDGPU/fptrunc.ll | 190 +- llvm/test/CodeGen/AMDGPU/frem.ll | 448 +- llvm/test/CodeGen/AMDGPU/fshl.ll | 344 +- llvm/test/CodeGen/AMDGPU/fshr.ll | 208 +- llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll | 181 +- llvm/test/CodeGen/AMDGPU/fsub.f16.ll | 156 +- .../CodeGen/AMDGPU/function-args-inreg.ll | 1349 +- llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll | 24 +- llvm/test/CodeGen/AMDGPU/gds-allocation.ll | 2 +- .../CodeGen/AMDGPU/gep-const-address-space.ll | 8 +- .../AMDGPU/gfx11-user-sgpr-init16-bug.ll | 31 +- .../global-atomicrmw-fadd-wrong-subtarget.ll | 36 +- 
.../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 21711 ++++++---------- .../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 4592 ++-- .../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 4592 ++-- .../global-atomics-fp-wrong-subtarget.ll | 16 +- llvm/test/CodeGen/AMDGPU/global-constant.ll | 4 +- .../CodeGen/AMDGPU/global-i16-load-store.ll | 24 +- .../AMDGPU/global-load-saddr-to-vaddr.ll | 4 +- llvm/test/CodeGen/AMDGPU/global_atomics.ll | 2702 +- .../AMDGPU/global_atomics_i32_system.ll | 244 +- .../test/CodeGen/AMDGPU/global_atomics_i64.ll | 1624 +- .../AMDGPU/global_atomics_i64_system.ll | 106 +- .../AMDGPU/global_atomics_scan_fadd.ll | 6055 +++-- .../AMDGPU/global_atomics_scan_fmax.ll | 4234 ++- .../AMDGPU/global_atomics_scan_fmin.ll | 4234 ++- .../AMDGPU/global_atomics_scan_fsub.ll | 5663 ++-- llvm/test/CodeGen/AMDGPU/global_smrd.ll | 4 +- llvm/test/CodeGen/AMDGPU/half.ll | 366 +- .../hsa-metadata-agpr-register-count.ll | 2 +- .../CodeGen/AMDGPU/hsa-metadata-heap-v5.ll | 5 +- .../AMDGPU/hsa-metadata-hostcall-v4.ll | 5 +- .../AMDGPU/hsa-metadata-hostcall-v5.ll | 5 +- .../AMDGPU/hsa-metadata-kernel-code-props.ll | 58 +- .../hsa-metadata-multigrid-sync-arg-v5.ll | 5 +- .../AMDGPU/hsa-metadata-queue-ptr-v5.ll | 16 +- .../AMDGPU/hsa-metadata-queueptr-v5.ll | 5 +- ...tadata-resource-usage-function-ordering.ll | 5 +- llvm/test/CodeGen/AMDGPU/hsa.ll | 6 +- llvm/test/CodeGen/AMDGPU/idiv-licm.ll | 434 +- llvm/test/CodeGen/AMDGPU/idot2.ll | 675 +- llvm/test/CodeGen/AMDGPU/idot4s.ll | 734 +- llvm/test/CodeGen/AMDGPU/idot4u.ll | 1374 +- llvm/test/CodeGen/AMDGPU/idot8s.ll | 722 +- llvm/test/CodeGen/AMDGPU/idot8u.ll | 925 +- llvm/test/CodeGen/AMDGPU/imm.ll | 500 +- llvm/test/CodeGen/AMDGPU/imm16.ll | 530 +- llvm/test/CodeGen/AMDGPU/immv216.ll | 2 +- .../AMDGPU/implicit-kernarg-backend-usage.ll | 40 +- .../CodeGen/AMDGPU/implicitarg-attributes.ll | 2 +- .../AMDGPU/indirect-call-known-callees.ll | 82 +- llvm/test/CodeGen/AMDGPU/infinite-loop.ll | 16 +- llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll 
| 24 +- llvm/test/CodeGen/AMDGPU/inline-attr.ll | 17 +- llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll | 2 +- .../CodeGen/AMDGPU/insert_vector_dynelt.ll | 734 +- llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll | 852 +- .../AMDGPU/insert_vector_elt.v2bf16.ll | 567 +- .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 627 +- .../insert_waitcnt_for_precise_memory.ll | 260 +- llvm/test/CodeGen/AMDGPU/ipra.ll | 8 +- llvm/test/CodeGen/AMDGPU/kernarg-size.ll | 6 +- llvm/test/CodeGen/AMDGPU/kernel-args.ll | 933 +- .../AMDGPU/kernel-argument-dag-lowering.ll | 98 +- .../test/CodeGen/AMDGPU/kill-infinite-loop.ll | 6 +- .../CodeGen/AMDGPU/large-alloca-compute.ll | 14 +- llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll | 432 +- .../CodeGen/AMDGPU/lds-zero-initializer.ll | 28 +- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 12 - .../AMDGPU/llvm.amdgcn.atomic.cond.sub.ll | 24 +- .../CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll | 2 +- .../CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll | 2 +- .../CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll | 2 +- .../AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll | 2 +- .../AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll | 2 +- .../CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll | 192 +- .../CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll | 37 +- ...llvm.amdgcn.ds.gws.barrier-fastregalloc.ll | 2 + .../CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll | 9 +- .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll | 1112 +- .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll | 1068 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 27 +- .../AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll | 6 +- .../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll | 4 +- .../CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll | 2 +- ...vm.amdgcn.global.atomic.ordered.add.b64.ll | 17 +- .../AMDGPU/llvm.amdgcn.global.load.tr-w32.ll | 4 +- .../AMDGPU/llvm.amdgcn.global.load.tr-w64.ll | 4 +- .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll | 732 +- .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll | 698 +- .../CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll | 8 +- .../AMDGPU/llvm.amdgcn.implicitarg.ptr.ll | 6 +- 
.../AMDGPU/llvm.amdgcn.intersect_ray.ll | 74 +- .../CodeGen/AMDGPU/llvm.amdgcn.is.private.ll | 8 +- .../CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll | 8 +- .../AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll | 9 +- .../AMDGPU/llvm.amdgcn.lds.kernel.id.ll | 28 +- .../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll | 4512 ++-- .../AMDGPU/llvm.amdgcn.permlane16.var.ll | 224 +- .../CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll | 424 +- .../AMDGPU/llvm.amdgcn.permlane64.ptr.ll | 46 +- .../CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll | 14 +- .../llvm.amdgcn.raw.buffer.atomic.fadd.ll | 30 +- ...mdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll | 8 +- ...amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll | 80 +- ...m.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll | 50 +- .../llvm.amdgcn.raw.ptr.buffer.load.bf16.ll | 96 +- .../AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll | 312 +- .../llvm.amdgcn.raw.ptr.buffer.store.ll | 336 +- .../llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll | 92 +- .../llvm.amdgcn.raw.tbuffer.store.d16.ll | 112 +- .../AMDGPU/llvm.amdgcn.readfirstlane.ll | 32 +- .../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 76 +- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll | 333 +- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll | 333 +- .../CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll | 117 +- .../AMDGPU/llvm.amdgcn.s.barrier.wait.ll | 363 +- .../CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll | 2 +- .../llvm.amdgcn.sched.group.barrier.gfx11.ll | 28 +- .../llvm.amdgcn.sched.group.barrier.gfx12.ll | 142 +- .../AMDGPU/llvm.amdgcn.sched.group.barrier.ll | 276 +- .../CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll | 24 +- .../AMDGPU/llvm.amdgcn.set.inactive.ll | 68 +- .../llvm.amdgcn.struct.buffer.atomic.fadd.ll | 24 +- ...cn.struct.ptr.buffer.atomic.fadd.v2bf16.ll | 4 +- ...gcn.struct.ptr.buffer.atomic.fadd_nortn.ll | 64 +- ...mdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll | 40 +- ...mdgcn.struct.ptr.buffer.atomic.fmax.f32.ll | 210 +- ...mdgcn.struct.ptr.buffer.atomic.fmax.f64.ll | 116 +- ...mdgcn.struct.ptr.buffer.atomic.fmin.f32.ll | 210 +- 
...mdgcn.struct.ptr.buffer.atomic.fmin.f64.ll | 116 +- ...lvm.amdgcn.struct.ptr.tbuffer.store.d16.ll | 156 +- .../llvm.amdgcn.struct.tbuffer.store.d16.ll | 176 +- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll | 280 +- .../AMDGPU/llvm.amdgcn.workgroup.id.ll | 9 +- .../CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll | 13 +- .../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll | 600 +- llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll | 16 +- llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll | 20 +- llvm/test/CodeGen/AMDGPU/llvm.exp.ll | 194 +- llvm/test/CodeGen/AMDGPU/llvm.exp10.ll | 194 +- llvm/test/CodeGen/AMDGPU/llvm.exp2.ll | 139 +- llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll | 16 +- llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll | 292 +- llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll | 12 +- .../CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll | 36 +- .../CodeGen/AMDGPU/llvm.is.fpclass.f16.ll | 38 +- llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll | 54 +- llvm/test/CodeGen/AMDGPU/llvm.log.ll | 254 +- llvm/test/CodeGen/AMDGPU/llvm.log10.ll | 254 +- llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 280 +- llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 70 +- llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll | 66 +- llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll | 108 +- llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll | 300 +- llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 58 +- llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll | 66 +- llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll | 108 +- llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll | 300 +- llvm/test/CodeGen/AMDGPU/llvm.mulo.ll | 20 +- .../AMDGPU/llvm.r600.read.local.size.ll | 146 +- llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll | 14 +- llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll | 651 +- llvm/test/CodeGen/AMDGPU/llvm.round.ll | 484 +- llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll | 12 +- llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll | 20 +- llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll | 12 +- llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll | 12 +- llvm/test/CodeGen/AMDGPU/load-constant-f32.ll | 4 +- 
llvm/test/CodeGen/AMDGPU/load-constant-f64.ll | 16 +- llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 264 +- llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 312 +- llvm/test/CodeGen/AMDGPU/load-constant-i32.ll | 250 +- llvm/test/CodeGen/AMDGPU/load-constant-i64.ll | 48 +- llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 416 +- llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 270 +- llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 228 +- llvm/test/CodeGen/AMDGPU/local-64.ll | 34 +- .../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 741 +- .../CodeGen/AMDGPU/local-memory.amdgcn.ll | 6 +- .../local-stack-alloc-block-sp-reference.ll | 86 +- .../AMDGPU/long-branch-reserve-register.ll | 66 +- .../test/CodeGen/AMDGPU/loop-prefetch-data.ll | 14 +- llvm/test/CodeGen/AMDGPU/loop_break.ll | 12 +- .../AMDGPU/lower-lds-struct-aa-memcpy.ll | 2 +- .../CodeGen/AMDGPU/lower-lds-struct-aa.ll | 16 +- .../AMDGPU/lower-module-lds-via-hybrid.ll | 102 +- .../AMDGPU/lower-module-lds-via-table.ll | 90 +- .../lower-work-group-id-intrinsics-hsa.ll | 189 +- llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll | 20 +- llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll | 156 +- llvm/test/CodeGen/AMDGPU/mad.u16.ll | 12 +- .../CodeGen/AMDGPU/mad24-get-global-id.ll | 2 +- llvm/test/CodeGen/AMDGPU/mad_64_32.ll | 46 +- llvm/test/CodeGen/AMDGPU/madak.ll | 516 +- .../match-perm-extract-vector-elt-bug.ll | 40 +- .../CodeGen/AMDGPU/max-hard-clause-length.ll | 12 +- llvm/test/CodeGen/AMDGPU/max.i16.ll | 82 +- llvm/test/CodeGen/AMDGPU/max.ll | 228 +- llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll | 1096 +- .../test/CodeGen/AMDGPU/memcpy-scalar-load.ll | 8 +- .../CodeGen/AMDGPU/memmove-scalar-load.ll | 8 +- llvm/test/CodeGen/AMDGPU/memory_clause.ll | 70 +- .../AMDGPU/mfma-bf16-vgpr-cd-select.ll | 30 +- llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll | 15 +- .../AMDGPU/mfma-vgpr-cd-select-gfx940.ll | 54 +- .../CodeGen/AMDGPU/mfma-vgpr-cd-select.ll | 28 +- llvm/test/CodeGen/AMDGPU/min.ll | 830 +- .../AMDGPU/module-lds-false-sharing.ll | 160 +- 
.../CodeGen/AMDGPU/move-to-valu-addsubu64.ll | 20 +- .../AMDGPU/move-to-valu-atomicrmw-system.ll | 12 +- .../CodeGen/AMDGPU/move-to-valu-atomicrmw.ll | 12 +- .../CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll | 16 +- .../move-to-valu-pseudo-scalar-trans.ll | 40 +- .../CodeGen/AMDGPU/mubuf-offset-private.ll | 34 +- llvm/test/CodeGen/AMDGPU/mul.ll | 888 +- llvm/test/CodeGen/AMDGPU/mul_int24.ll | 242 +- llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll | 292 +- llvm/test/CodeGen/AMDGPU/multilevel-break.ll | 2 +- .../CodeGen/AMDGPU/nested-loop-conditions.ll | 4 +- llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll | 93 +- llvm/test/CodeGen/AMDGPU/offset-split-flat.ll | 308 +- .../CodeGen/AMDGPU/offset-split-global.ll | 264 +- llvm/test/CodeGen/AMDGPU/omod.ll | 56 +- llvm/test/CodeGen/AMDGPU/optimize-compare.ll | 32 +- .../CodeGen/AMDGPU/optimize-negated-cond.ll | 4 +- llvm/test/CodeGen/AMDGPU/or.ll | 376 +- llvm/test/CodeGen/AMDGPU/pack.v2f16.ll | 60 +- llvm/test/CodeGen/AMDGPU/pack.v2i16.ll | 54 +- llvm/test/CodeGen/AMDGPU/packed-op-sel.ll | 2 +- ...al-regcopy-and-spill-missed-at-regalloc.ll | 2 +- .../CodeGen/AMDGPU/partial-shift-shrink.ll | 2 +- llvm/test/CodeGen/AMDGPU/permlane-op-sel.ll | 4 +- llvm/test/CodeGen/AMDGPU/permlane16_opsel.ll | 45 +- llvm/test/CodeGen/AMDGPU/permute.ll | 148 +- llvm/test/CodeGen/AMDGPU/permute_i8.ll | 40 +- .../AMDGPU/post-ra-soft-clause-dbg-info.ll | 2 +- .../CodeGen/AMDGPU/preload-kernarg-header.ll | 2 +- llvm/test/CodeGen/AMDGPU/preload-kernargs.ll | 3080 +-- .../CodeGen/AMDGPU/private-memory-atomics.ll | 8 +- .../AMDGPU/promote-constOffset-to-imm.ll | 260 +- .../AMDGPU/ptr-buffer-alias-scheduling.ll | 16 +- llvm/test/CodeGen/AMDGPU/rcp-pattern.ll | 148 +- .../AMDGPU/reassoc-mul-add-1-to-mad.ll | 183 +- ...emove-incompatible-extended-image-insts.ll | 5 +- .../AMDGPU/remove-incompatible-functions.ll | 156 +- .../CodeGen/AMDGPU/remove-incompatible-gws.ll | 7 +- .../AMDGPU/remove-incompatible-s-time.ll | 11 +- llvm/test/CodeGen/AMDGPU/rotl.ll | 60 +- 
llvm/test/CodeGen/AMDGPU/rotr.ll | 44 +- llvm/test/CodeGen/AMDGPU/rsq.f32.ll | 87 +- llvm/test/CodeGen/AMDGPU/s_addk_i32.ll | 20 +- llvm/test/CodeGen/AMDGPU/sad.ll | 136 +- llvm/test/CodeGen/AMDGPU/saddo.ll | 162 +- llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll | 16 +- .../CodeGen/AMDGPU/scalar_to_vector.v8i16.ll | 22 +- .../scc-clobbered-sgpr-to-vmem-spill.ll | 2 +- llvm/test/CodeGen/AMDGPU/sdiv.ll | 72 +- llvm/test/CodeGen/AMDGPU/sdiv64.ll | 224 +- llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll | 380 +- .../CodeGen/AMDGPU/select-constant-cttz.ll | 2 +- llvm/test/CodeGen/AMDGPU/select.f16.ll | 184 +- llvm/test/CodeGen/AMDGPU/setcc.ll | 2 +- .../AMDGPU/sext-divergence-driven-isel.ll | 26 +- llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll | 42 +- .../CodeGen/AMDGPU/sgpr-copy-local-cse.ll | 2 +- .../CodeGen/AMDGPU/shift-and-i128-ubfe.ll | 2 +- .../test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll | 3 +- llvm/test/CodeGen/AMDGPU/shift-i128.ll | 12 +- llvm/test/CodeGen/AMDGPU/shl.ll | 408 +- llvm/test/CodeGen/AMDGPU/shl.v2i16.ll | 154 +- .../CodeGen/AMDGPU/shrink-add-sub-constant.ll | 562 +- llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll | 72 +- .../AMDGPU/si-annotate-cfg-loop-assert.ll | 2 +- .../si-unify-exit-multiple-unreachables.ll | 17 +- llvm/test/CodeGen/AMDGPU/sibling-call.ll | 2 +- llvm/test/CodeGen/AMDGPU/sign_extend.ll | 100 +- .../CodeGen/AMDGPU/simple-indirect-call.ll | 33 +- llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll | 2 +- llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll | 64 +- llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll | 260 +- llvm/test/CodeGen/AMDGPU/sitofp.f16.ll | 76 +- llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll | 6 +- llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll | 2 +- llvm/test/CodeGen/AMDGPU/sopk-compares.ll | 4 +- llvm/test/CodeGen/AMDGPU/spill-agpr.ll | 8 +- .../AMDGPU/spill-offset-calculation.ll | 30 +- .../CodeGen/AMDGPU/spill-scavenge-offset.ll | 48 +- .../AMDGPU/spill-sgpr-stack-no-sgpr.ll | 15 +- .../CodeGen/AMDGPU/spill-vector-superclass.ll | 10 +- 
.../CodeGen/AMDGPU/spill-writelane-vgprs.ll | 2 +- llvm/test/CodeGen/AMDGPU/sra.ll | 68 +- llvm/test/CodeGen/AMDGPU/srem.ll | 84 +- llvm/test/CodeGen/AMDGPU/srem64.ll | 140 +- llvm/test/CodeGen/AMDGPU/srl.ll | 32 +- ...tack-pointer-offset-relative-frameindex.ll | 18 +- .../CodeGen/AMDGPU/stack-realign-kernel.ll | 132 +- .../CodeGen/AMDGPU/stacksave_stackrestore.ll | 186 +- llvm/test/CodeGen/AMDGPU/store-local.128.ll | 144 +- llvm/test/CodeGen/AMDGPU/store-local.96.ll | 144 +- llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll | 98 +- llvm/test/CodeGen/AMDGPU/sub.ll | 192 +- llvm/test/CodeGen/AMDGPU/sub.v2i16.ll | 204 +- .../AMDGPU/subreg-coalescer-undef-use.ll | 3 +- llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll | 4 +- llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll | 4 +- llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll | 4 +- llvm/test/CodeGen/AMDGPU/trap-abis.ll | 46 +- llvm/test/CodeGen/AMDGPU/trap.ll | 14 +- llvm/test/CodeGen/AMDGPU/trunc-combine.ll | 8 +- llvm/test/CodeGen/AMDGPU/trunc-store.ll | 92 +- llvm/test/CodeGen/AMDGPU/trunc.ll | 2 +- llvm/test/CodeGen/AMDGPU/uaddo.ll | 142 +- llvm/test/CodeGen/AMDGPU/udiv.ll | 112 +- llvm/test/CodeGen/AMDGPU/udiv64.ll | 120 +- llvm/test/CodeGen/AMDGPU/udivrem.ll | 302 +- llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll | 90 +- llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll | 243 +- llvm/test/CodeGen/AMDGPU/uitofp.f16.ll | 76 +- llvm/test/CodeGen/AMDGPU/uniform-cfg.ll | 290 +- llvm/test/CodeGen/AMDGPU/uniform-select.ll | 8 +- llvm/test/CodeGen/AMDGPU/urem64.ll | 364 +- llvm/test/CodeGen/AMDGPU/usubo.ll | 142 +- .../CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll | 2 +- llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll | 4 +- llvm/test/CodeGen/AMDGPU/v_cndmask.ll | 740 +- llvm/test/CodeGen/AMDGPU/v_madak_f16.ll | 124 +- llvm/test/CodeGen/AMDGPU/v_pack.ll | 20 +- llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll | 106 +- .../CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll | 2 +- .../CodeGen/AMDGPU/vector-extract-insert.ll | 41 +- .../CodeGen/AMDGPU/vector_shuffle.packed.ll | 
90 +- llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll | 18 +- .../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 148 +- .../CodeGen/AMDGPU/waterfall_kills_scc.ll | 30 +- llvm/test/CodeGen/AMDGPU/wave32.ll | 313 +- llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll | 74 +- .../AMDGPU/workgroup-id-in-arch-sgprs.ll | 38 +- llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 266 +- llvm/test/CodeGen/AMDGPU/xor.ll | 276 +- .../AMDGPU/zext-divergence-driven-isel.ll | 16 +- .../AMDGPU/long-branch-reg-all-sgpr-used.ll | 2 + .../AMDGPU/machine-function-info-after-pei.ll | 2 +- .../machine-function-info-long-branch-reg.ll | 4 +- .../MIR/AMDGPU/machine-function-info.ll | 12 +- .../InferAddressSpaces/AMDGPU/flat_atomic.ll | 50 +- .../Inputs/amdgpu_isel.ll.expected | 10 +- 562 files changed, 86435 insertions(+), 90178 deletions(-) diff --git a/clang/test/CodeGenHIP/default-attributes.hip b/clang/test/CodeGenHIP/default-attributes.hip index 63572bfd242b9..107ef6b94c4de 100644 --- a/clang/test/CodeGenHIP/default-attributes.hip +++ b/clang/test/CodeGenHIP/default-attributes.hip @@ -8,6 +8,15 @@ #define __device__ __attribute__((device)) #define __global__ __attribute__((global)) +//. +// OPTNONE: @__hip_cuid_ = addrspace(1) global i8 0 +// OPTNONE: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr)], section "llvm.metadata" +// OPTNONE: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500 +//. +// OPT: @__hip_cuid_ = addrspace(1) global i8 0 +// OPT: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500 +// OPT: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr)], section "llvm.metadata" +//. 
// OPTNONE: Function Attrs: convergent mustprogress noinline nounwind optnone // OPTNONE-LABEL: define {{[^@]+}}@_Z4funcv // OPTNONE-SAME: () #[[ATTR0:[0-9]+]] { @@ -40,17 +49,17 @@ __global__ void kernel() { } //. -// OPTNONE: attributes #0 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// OPTNONE: attributes #1 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } +// OPTNONE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// OPTNONE: attributes #[[ATTR1]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } //. -// OPT: attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// OPT: attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } +// OPT: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"uniform-work-group-size"="false" } +// OPT: attributes #[[ATTR1]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-flat-work-group-size"="1,1024" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } //. -// OPTNONE: !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} -// OPTNONE: !1 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} -// OPTNONE: !2 = !{i32 1, !"wchar_size", i32 4} +// OPTNONE: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} +// OPTNONE: [[META1:![0-9]+]] = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} +// OPTNONE: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} //. -// OPT: !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} -// OPT: !1 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} -// OPT: !2 = !{i32 1, !"wchar_size", i32 4} +// OPT: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} +// OPT: [[META1:![0-9]+]] = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} +// OPT: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} //. diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 1dd7fce2334c9..55b3b486d705d 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -139,6 +139,10 @@ Changes to the AMDGPU Backend :ref:`atomicrmw ` instruction with `fadd`, `fmin` and `fmax` with addrspace(3) instead. +* AMDGPUAttributor is no longer run as part of the codegen pass + pipeline. It is expected to run as part of the middle end + optimizations. 
+ Changes to the ARM Backend -------------------------- diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index f50a18ccc2188..9ddf0a310ed06 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -731,6 +731,14 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM))); }); + // FIXME: Why is AMDGPUAttributor not in CGSCC? + PB.registerOptimizerLastEPCallback( + [this](ModulePassManager &MPM, OptimizationLevel Level) { + if (Level != OptimizationLevel::O0) { + MPM.addPass(AMDGPUAttributorPass(*this)); + } + }); + PB.registerFullLinkTimeOptimizationLastEPCallback( [this](ModulePassManager &PM, OptimizationLevel Level) { // We want to support the -lto-partitions=N option as "best effort". @@ -1037,11 +1045,6 @@ void AMDGPUPassConfig::addIRPasses() { addPass(createAMDGPULowerModuleLDSLegacyPass(&TM)); } - // AMDGPUAttributor infers lack of llvm.amdgcn.lds.kernel.id calls, so run - // after their introduction - if (TM.getOptLevel() > CodeGenOptLevel::None) - addPass(createAMDGPUAttributorLegacyPass()); - if (TM.getOptLevel() > CodeGenOptLevel::None) addPass(createInferAddressSpacesPass()); diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 97a8ff4486609..8c951105101d9 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -679,6 +679,12 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, break; } } + + // FIXME: We can spill incoming arguments and restore at the end of the + // prolog. 
+ if (!ScratchWaveOffsetReg) + report_fatal_error( + "could not find temporary scratch offset register in prolog"); } else { ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll index a38b6e3263882..359c1e53de99e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-LABEL: s_add_u64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s6, s0 @@ -22,8 +22,8 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX12-LABEL: s_add_u64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1] @@ -58,8 +58,8 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-LABEL: s_sub_u64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s0, s6, s0 @@ -74,8 +74,8 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX12-LABEL: s_sub_u64: ; GFX12: ; 
%bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[6:7], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index 9be8620b024eb..0a8e805027c77 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -2026,7 +2026,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s4 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s6 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v1, v1 @@ -2056,7 +2056,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 @@ -2083,7 +2083,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s6 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v3, v1, v1 @@ -2114,10 +2114,14 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: v_mov_b32_e32 v2, s18 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s16 +; GFX10-NEXT: s_mov_b32 s7, s17 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2143,7 +2147,11 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: v_mov_b32_e32 v2, s18 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX90A-NEXT: s_mov_b64 s[8:9], 0 @@ -2169,7 +2177,11 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s8 +; GFX908-NEXT: s_mov_b32 s4, s6 +; GFX908-NEXT: s_mov_b32 s5, s7 +; GFX908-NEXT: s_mov_b32 s6, s16 +; GFX908-NEXT: s_mov_b32 s7, s17 +; GFX908-NEXT: v_mov_b32_e32 v2, s18 ; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX908-NEXT: 
s_mov_b64 s[8:9], 0 @@ -2196,7 +2208,11 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s16 +; GFX8-NEXT: s_mov_b32 s7, s17 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX8-NEXT: s_mov_b64 s[8:9], 0 @@ -2223,7 +2239,11 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s7 +; GFX7-NEXT: s_mov_b32 s6, s16 +; GFX7-NEXT: s_mov_b32 s7, s17 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 ; GFX7-NEXT: v_mov_b32_e32 v1, v0 ; GFX7-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -2258,7 +2278,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_max_num_f32 v3, v0, v0 +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_max_num_f32 v3, v0, v0 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2285,7 +2305,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: v_mov_b32_e32 v2, s6 ; 
GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v3, v0, v0 @@ -2311,7 +2331,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_max_f32 v3, v0, v0 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_max_f32 v3, v0, v0 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2339,10 +2359,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: v_mov_b32_e32 v2, s18 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s16 +; GFX10-NEXT: s_mov_b32 s7, s17 ; GFX10-NEXT: v_max_f32_e32 v3, v0, v0 -; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2367,7 +2391,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: v_mov_b32_e32 v2, s18 ; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: 
v_max_f32_e32 v3, v0, v0 @@ -2392,7 +2420,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s8 +; GFX908-NEXT: s_mov_b32 s4, s6 +; GFX908-NEXT: s_mov_b32 s5, s7 +; GFX908-NEXT: s_mov_b32 s6, s16 +; GFX908-NEXT: s_mov_b32 s7, s17 +; GFX908-NEXT: v_mov_b32_e32 v2, s18 ; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 @@ -2418,7 +2450,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s16 +; GFX8-NEXT: s_mov_b32 s7, s17 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0 @@ -2444,7 +2480,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s7 +; GFX7-NEXT: s_mov_b32 s6, s16 +; GFX7-NEXT: s_mov_b32 s7, s17 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 ; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 @@ -2478,7 +2518,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: 
s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_mov_b32_e32 v6, s6 ; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen @@ -2509,7 +2549,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v6, s4 +; GFX940-NEXT: v_mov_b32_e32 v6, s6 ; GFX940-NEXT: v_mov_b32_e32 v2, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, v1 ; GFX940-NEXT: buffer_load_dwordx2 v[0:1], v6, s[0:3], 0 offen @@ -2538,7 +2578,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_mov_b32_e32 v6, s6 ; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen @@ -2570,11 +2610,15 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, s8 +; GFX10-NEXT: v_mov_b32_e32 v6, s18 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s16 +; GFX10-NEXT: s_mov_b32 s7, s17 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: 
Depth=1 @@ -2603,7 +2647,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, s8 +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: v_mov_b32_e32 v6, s18 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen @@ -2631,7 +2679,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, s8 +; GFX908-NEXT: s_mov_b32 s4, s6 +; GFX908-NEXT: s_mov_b32 s5, s7 +; GFX908-NEXT: s_mov_b32 s6, s16 +; GFX908-NEXT: s_mov_b32 s7, s17 +; GFX908-NEXT: v_mov_b32_e32 v6, s18 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen @@ -2662,7 +2714,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, s8 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s16 +; GFX8-NEXT: s_mov_b32 s7, s17 +; GFX8-NEXT: v_mov_b32_e32 v6, s18 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen @@ -2693,7 +2749,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX7-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v6, s8 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s7 +; GFX7-NEXT: s_mov_b32 s6, s16 +; GFX7-NEXT: s_mov_b32 s7, s17 +; GFX7-NEXT: v_mov_b32_e32 v6, s18 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen @@ -2732,7 +2792,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_mov_b32_e32 v6, s6 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen @@ -2761,7 +2821,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v6, s4 +; GFX940-NEXT: v_mov_b32_e32 v6, s6 ; GFX940-NEXT: buffer_load_dwordx2 v[2:3], v6, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] @@ -2788,7 +2848,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_mov_b32_e32 v6, s6 ; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen @@ -2818,10 +2878,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX10-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, s8 +; GFX10-NEXT: v_mov_b32_e32 v6, s18 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s16 +; GFX10-NEXT: s_mov_b32 s7, s17 ; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2849,7 +2913,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, s8 +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: v_mov_b32_e32 v6, s18 ; GFX90A-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] @@ -2875,7 +2943,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, s8 +; GFX908-NEXT: s_mov_b32 s4, s6 +; GFX908-NEXT: s_mov_b32 s5, s7 +; GFX908-NEXT: s_mov_b32 s6, s16 +; GFX908-NEXT: s_mov_b32 s7, s17 +; GFX908-NEXT: v_mov_b32_e32 v6, s18 ; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX908-NEXT: s_mov_b64 s[8:9], 0 @@ -2904,7 +2976,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ 
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, s8 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s16 +; GFX8-NEXT: s_mov_b32 s7, s17 +; GFX8-NEXT: v_mov_b32_e32 v6, s18 ; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX8-NEXT: s_mov_b64 s[8:9], 0 @@ -2933,7 +3009,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v6, s8 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s7 +; GFX7-NEXT: s_mov_b32 s6, s16 +; GFX7-NEXT: s_mov_b32 s7, s17 +; GFX7-NEXT: v_mov_b32_e32 v6, s18 ; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7-NEXT: s_mov_b64 s[8:9], 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index 97d68d9c2e621..fb81176a7419e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -2026,7 +2026,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s4 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s6 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v1, v1 @@ -2056,7 +2056,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 @@ -2083,7 +2083,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s6 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v3, v1, v1 @@ -2114,10 +2114,14 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: v_mov_b32_e32 v2, s18 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s16 +; GFX10-NEXT: s_mov_b32 s7, s17 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2143,7 +2147,11 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 
s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: v_mov_b32_e32 v2, s18 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX90A-NEXT: s_mov_b64 s[8:9], 0 @@ -2169,7 +2177,11 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s8 +; GFX908-NEXT: s_mov_b32 s4, s6 +; GFX908-NEXT: s_mov_b32 s5, s7 +; GFX908-NEXT: s_mov_b32 s6, s16 +; GFX908-NEXT: s_mov_b32 s7, s17 +; GFX908-NEXT: v_mov_b32_e32 v2, s18 ; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX908-NEXT: s_mov_b64 s[8:9], 0 @@ -2196,7 +2208,11 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s16 +; GFX8-NEXT: s_mov_b32 s7, s17 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX8-NEXT: s_mov_b64 s[8:9], 0 @@ -2223,7 +2239,11 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s7 +; GFX7-NEXT: s_mov_b32 s6, s16 +; GFX7-NEXT: s_mov_b32 s7, s17 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 ; GFX7-NEXT: v_mov_b32_e32 v1, v0 ; GFX7-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -2258,7 +2278,7 @@ 
define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_max_num_f32 v3, v0, v0 +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_max_num_f32 v3, v0, v0 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2285,7 +2305,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v3, v0, v0 @@ -2311,7 +2331,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_max_f32 v3, v0, v0 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_max_f32 v3, v0, v0 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2339,10 +2359,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: v_mov_b32_e32 v2, s18 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s16 +; GFX10-NEXT: s_mov_b32 s7, s17 ; GFX10-NEXT: v_max_f32_e32 v3, v0, v0 -; GFX10-NEXT: s_mov_b32 s8, 0 ; 
GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2367,7 +2391,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: v_mov_b32_e32 v2, s18 ; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 @@ -2392,7 +2420,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s8 +; GFX908-NEXT: s_mov_b32 s4, s6 +; GFX908-NEXT: s_mov_b32 s5, s7 +; GFX908-NEXT: s_mov_b32 s6, s16 +; GFX908-NEXT: s_mov_b32 s7, s17 +; GFX908-NEXT: v_mov_b32_e32 v2, s18 ; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 @@ -2418,7 +2450,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s16 +; GFX8-NEXT: s_mov_b32 s7, s17 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: 
v_mul_f32_e32 v3, 1.0, v0 @@ -2444,7 +2480,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s7 +; GFX7-NEXT: s_mov_b32 s6, s16 +; GFX7-NEXT: s_mov_b32 s7, s17 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 ; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 @@ -2478,7 +2518,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_mov_b32_e32 v6, s6 ; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen @@ -2509,7 +2549,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v6, s4 +; GFX940-NEXT: v_mov_b32_e32 v6, s6 ; GFX940-NEXT: v_mov_b32_e32 v2, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, v1 ; GFX940-NEXT: buffer_load_dwordx2 v[0:1], v6, s[0:3], 0 offen @@ -2538,7 +2578,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_mov_b32_e32 v6, s6 ; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen @@ 
-2570,11 +2610,15 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, s8 +; GFX10-NEXT: v_mov_b32_e32 v6, s18 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s16 +; GFX10-NEXT: s_mov_b32 s7, s17 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2603,7 +2647,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, s8 +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: v_mov_b32_e32 v6, s18 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen @@ -2631,7 +2679,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, s8 +; GFX908-NEXT: s_mov_b32 s4, s6 +; GFX908-NEXT: s_mov_b32 s5, s7 +; GFX908-NEXT: s_mov_b32 s6, s16 +; GFX908-NEXT: s_mov_b32 s7, s17 +; GFX908-NEXT: v_mov_b32_e32 v6, s18 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: buffer_load_dwordx2 
v[0:1], v6, s[4:7], 0 offen @@ -2662,7 +2714,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, s8 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s16 +; GFX8-NEXT: s_mov_b32 s7, s17 +; GFX8-NEXT: v_mov_b32_e32 v6, s18 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen @@ -2693,7 +2749,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v6, s8 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s7 +; GFX7-NEXT: s_mov_b32 s6, s16 +; GFX7-NEXT: s_mov_b32 s7, s17 +; GFX7-NEXT: v_mov_b32_e32 v6, s18 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen @@ -2732,7 +2792,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_mov_b32_e32 v6, s6 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen @@ -2761,7 +2821,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v6, s4 +; GFX940-NEXT: v_mov_b32_e32 v6, s6 ; GFX940-NEXT: buffer_load_dwordx2 
v[2:3], v6, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] @@ -2788,7 +2848,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_mov_b32_e32 v6, s6 ; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen @@ -2818,10 +2878,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, s8 +; GFX10-NEXT: v_mov_b32_e32 v6, s18 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s16 +; GFX10-NEXT: s_mov_b32 s7, s17 ; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2849,7 +2913,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, s8 +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: v_mov_b32_e32 v6, s18 ; GFX90A-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] @@ -2875,7 +2943,11 @@ 
define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, s8 +; GFX908-NEXT: s_mov_b32 s4, s6 +; GFX908-NEXT: s_mov_b32 s5, s7 +; GFX908-NEXT: s_mov_b32 s6, s16 +; GFX908-NEXT: s_mov_b32 s7, s17 +; GFX908-NEXT: v_mov_b32_e32 v6, s18 ; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX908-NEXT: s_mov_b64 s[8:9], 0 @@ -2904,7 +2976,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, s8 +; GFX8-NEXT: s_mov_b32 s4, s6 +; GFX8-NEXT: s_mov_b32 s5, s7 +; GFX8-NEXT: s_mov_b32 s6, s16 +; GFX8-NEXT: s_mov_b32 s7, s17 +; GFX8-NEXT: v_mov_b32_e32 v6, s18 ; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX8-NEXT: s_mov_b64 s[8:9], 0 @@ -2933,7 +3009,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v6, s8 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s7 +; GFX7-NEXT: s_mov_b32 s6, s16 +; GFX7-NEXT: s_mov_b32 s7, s17 +; GFX7-NEXT: v_mov_b32_e32 v6, s18 ; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7-NEXT: s_mov_b64 s[8:9], 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll index b04bc04ab2269..705bcbddf227a 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll @@ -16,8 +16,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -31,8 +31,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: lds_atomic_dec_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -46,8 +46,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: lds_atomic_dec_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -59,11 +59,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: lds_atomic_dec_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_dec_rtn_u32 v0, v0, v1 ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -74,11 +74,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: lds_atomic_dec_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: ds_dec_rtn_u32 v0, v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -95,8 +95,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_ret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -110,8 +110,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; ; VI-LABEL: lds_atomic_dec_ret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -125,8 +125,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: lds_atomic_dec_ret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], 
s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -138,11 +138,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; ; GFX10-LABEL: lds_atomic_dec_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_dec_rtn_u32 v0, v1, v0 offset:16 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -153,11 +153,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; ; GFX11-LABEL: lds_atomic_dec_ret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: ds_dec_rtn_u32 v0, v1, v0 offset:16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -175,7 +175,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_noret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[4:5], 0x0 +; CI-NEXT: s_load_dword s0, s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -186,7 +186,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; VI-LABEL: lds_atomic_dec_noret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[4:5], 0x0 +; VI-NEXT: s_load_dword s0, s[6:7], 
0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -197,7 +197,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; GFX9-LABEL: lds_atomic_dec_noret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -207,7 +207,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; GFX10-LABEL: lds_atomic_dec_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -218,7 +218,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; GFX11-LABEL: lds_atomic_dec_noret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 ; GFX11-NEXT: ds_dec_u32 v0, v1 @@ -232,7 +232,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 { define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_noret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[4:5], 0x0 +; CI-NEXT: s_load_dword s0, s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -243,7 +243,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr ; ; VI-LABEL: lds_atomic_dec_noret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[4:5], 0x0 +; VI-NEXT: s_load_dword s0, s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -254,7 +254,7 @@ define amdgpu_kernel void 
@lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr ; ; GFX9-LABEL: lds_atomic_dec_noret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s0 @@ -264,7 +264,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr ; ; GFX10-LABEL: lds_atomic_dec_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s0 @@ -275,7 +275,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr ; ; GFX11-LABEL: lds_atomic_dec_noret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: ds_dec_u32 v1, v0 offset:16 @@ -290,7 +290,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -305,7 +305,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr ; ; VI-LABEL: global_atomic_dec_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -320,7 +320,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: global_atomic_dec_ret_i32: 
; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -332,7 +332,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: global_atomic_dec_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -345,7 +345,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_atomic_dec_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] glc @@ -364,7 +364,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -381,7 +381,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou ; ; VI-LABEL: global_atomic_dec_ret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -398,7 +398,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: global_atomic_dec_ret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -410,7 +410,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou ; ; GFX10-LABEL: global_atomic_dec_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -423,7 +423,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_atomic_dec_ret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] offset:16 glc @@ -443,7 +443,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -460,7 +460,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; ; VI-LABEL: global_atomic_dec_ret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -477,7 +477,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; ; GFX9-LABEL: global_atomic_dec_ret_i32_offset_system: ; GFX9: ; %bb.0: 
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -489,7 +489,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; ; GFX10-LABEL: global_atomic_dec_ret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -502,7 +502,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; ; GFX11-LABEL: global_atomic_dec_ret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] offset:16 glc @@ -522,7 +522,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -534,7 +534,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 ; ; VI-LABEL: global_atomic_dec_noret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -546,7 +546,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 ; ; GFX9-LABEL: global_atomic_dec_noret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -557,7 +557,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 ; ; GFX10-LABEL: global_atomic_dec_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -569,7 +569,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 ; ; GFX11-LABEL: global_atomic_dec_noret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1] @@ -584,7 +584,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -598,7 +598,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % ; ; VI-LABEL: global_atomic_dec_noret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -612,7 +612,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % ; ; GFX9-LABEL: global_atomic_dec_noret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 
0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -623,7 +623,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % ; ; GFX10-LABEL: global_atomic_dec_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -635,7 +635,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % ; ; GFX11-LABEL: global_atomic_dec_noret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1] offset:16 @@ -651,7 +651,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -665,7 +665,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa ; ; VI-LABEL: global_atomic_dec_noret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -679,7 +679,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa ; ; GFX9-LABEL: global_atomic_dec_noret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; 
GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -690,7 +690,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa ; ; GFX10-LABEL: global_atomic_dec_noret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -702,7 +702,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa ; ; GFX11-LABEL: global_atomic_dec_noret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1] offset:16 @@ -718,7 +718,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -740,7 +740,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; ; VI-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -762,7 +762,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; ; GFX9-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: 
s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -774,7 +774,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; ; GFX10-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -787,8 +787,10 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; ; GFX11-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v1, v0, v1, s[2:3] offset:20 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -810,7 +812,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -827,7 +829,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa ; ; VI-LABEL: global_atomic_dec_noret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 
v0, s0 @@ -844,7 +846,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa ; ; GFX9-LABEL: global_atomic_dec_noret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -855,7 +857,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa ; ; GFX10-LABEL: global_atomic_dec_noret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -867,8 +869,10 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa ; ; GFX11-LABEL: global_atomic_dec_noret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v0, v1, s[0:1] offset:20 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -885,7 +889,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -900,7 +904,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_dec_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -915,7 +919,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_dec_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -930,7 +934,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 @@ -946,7 +950,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_dec_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -965,7 +969,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -982,7 +986,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_dec_ret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -999,7 +1003,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1014,7 +1018,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, 16 @@ -1032,7 +1036,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_dec_ret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -1052,7 +1056,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -1069,7 +1073,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; ; VI-LABEL: flat_atomic_dec_ret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 
s2, s2, 16 @@ -1086,7 +1090,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1101,7 +1105,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, 16 @@ -1119,7 +1123,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; ; GFX11-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -1139,7 +1143,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1151,7 +1155,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_dec_noret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1163,7 +1167,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr 
%ptr) #1 { ; ; GFX9-LABEL: flat_atomic_dec_noret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1175,7 +1179,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -1189,7 +1193,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_dec_noret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -1206,7 +1210,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -1220,7 +1224,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_dec_noret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -1234,7 +1238,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1246,7 +1250,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 16 @@ -1262,7 +1266,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_dec_noret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -1280,7 +1284,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -1294,7 +1298,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_dec_noret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -1308,7 +1312,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 
v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1320,7 +1324,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 16 @@ -1336,7 +1340,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -1354,7 +1358,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1376,7 +1380,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; ; VI-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1398,7 +1402,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; 
GFX9-NEXT: v_mov_b32_e32 v3, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1418,7 +1422,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1441,12 +1445,14 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX11-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_lshlrev_b32 v2, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_atomic_dec_u32 v3, v[0:1], v3 offset:20 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1470,7 +1476,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1487,7 +1493,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; ; 
VI-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1504,7 +1510,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1519,7 +1525,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -1538,12 +1544,14 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:20 @@ -1562,7 +1570,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr 
%ptr) #1 define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1583,7 +1591,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_dec_ret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1604,7 +1612,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_dec_ret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1620,7 +1628,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_ret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1637,7 +1645,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_dec_ret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 @@ -1657,7 +1665,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i64_offset: ; CI: ; 
%bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1680,7 +1688,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_dec_ret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1703,7 +1711,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_dec_ret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1719,7 +1727,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_ret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1738,7 +1746,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_dec_ret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 @@ -1759,7 +1767,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 
0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1772,7 +1780,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_dec_noret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1785,7 +1793,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_dec_noret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1798,7 +1806,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1813,7 +1821,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_dec_noret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1831,7 +1839,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1846,7 +1854,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { 
; ; VI-LABEL: flat_atomic_dec_noret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1861,7 +1869,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1874,7 +1882,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1891,7 +1899,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_dec_noret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1910,7 +1918,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1925,7 +1933,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_dec_noret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1940,7 +1948,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1953,7 +1961,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1970,7 +1978,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1989,7 +1997,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -2015,7 +2023,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; ; VI-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; 
VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -2041,7 +2049,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2062,7 +2070,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 @@ -2086,14 +2094,15 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX11-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:40 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2117,7 +2126,7 @@ define 
amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2135,7 +2144,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2153,7 +2162,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2169,7 +2178,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -2189,14 +2198,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:40 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2214,7 +2224,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #1 { ; CI-LABEL: atomic_dec_shl_base_lds_0: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; CI-NEXT: v_mov_b32_e32 v2, 9 ; CI-NEXT: s_mov_b32 m0, -1 @@ -2232,7 +2242,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_dec_shl_base_lds_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_mov_b32_e32 v2, 9 ; VI-NEXT: s_mov_b32 m0, -1 @@ -2250,7 +2260,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_dec_shl_base_lds_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2266,7 +2276,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; GFX10: ; %bb.0: ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; 
GFX10-NEXT: v_mov_b32_e32 v2, 9 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_dec_rtn_u32 v1, v1, v2 offset:8 @@ -2279,8 +2289,10 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: atomic_dec_shl_base_lds_0: ; GFX11: ; %bb.0: +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 9 :: v_dual_lshlrev_b32 v1, 2, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_dec_rtn_u32 v1, v1, v2 offset:8 @@ -2305,8 +2317,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_ret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -2321,8 +2333,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: lds_atomic_dec_ret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -2337,8 +2349,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: lds_atomic_dec_ret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2351,12 +2363,12 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: lds_atomic_dec_ret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2367,12 +2379,12 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: lds_atomic_dec_ret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -2389,8 +2401,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_ret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; 
CI-NEXT: s_mov_b32 m0, -1 @@ -2405,8 +2417,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: lds_atomic_dec_ret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -2421,8 +2433,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: lds_atomic_dec_ret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2435,12 +2447,12 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; ; GFX10-LABEL: lds_atomic_dec_ret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2451,12 +2463,12 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; ; GFX11-LABEL: lds_atomic_dec_ret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: 
v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -2474,7 +2486,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_noret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[4:5], 0x0 +; CI-NEXT: s_load_dword s0, s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -2486,7 +2498,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; VI-LABEL: lds_atomic_dec_noret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[4:5], 0x0 +; VI-NEXT: s_load_dword s0, s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -2498,7 +2510,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; GFX9-LABEL: lds_atomic_dec_noret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2509,7 +2521,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; GFX10-LABEL: lds_atomic_dec_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2521,7 +2533,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; GFX11-LABEL: lds_atomic_dec_noret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 @@ -2536,7 +2548,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 { define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_noret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[4:5], 0x0 +; CI-NEXT: s_load_dword s0, s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -2548,7 +2560,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr ; ; VI-LABEL: lds_atomic_dec_noret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[4:5], 0x0 +; VI-NEXT: s_load_dword s0, s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -2560,7 +2572,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr ; ; GFX9-LABEL: lds_atomic_dec_noret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2571,7 +2583,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr ; ; GFX10-LABEL: lds_atomic_dec_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2583,7 +2595,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr ; ; GFX11-LABEL: lds_atomic_dec_noret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 @@ -2599,7 +2611,7 @@ define amdgpu_kernel void 
@lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2615,7 +2627,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: global_atomic_dec_ret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2631,7 +2643,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: global_atomic_dec_ret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2644,7 +2656,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: global_atomic_dec_ret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2658,7 +2670,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_atomic_dec_ret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2678,7 +2690,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr define amdgpu_kernel void 
@global_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2696,7 +2708,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: global_atomic_dec_ret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2714,7 +2726,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: global_atomic_dec_ret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2727,7 +2739,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; ; GFX10-LABEL: global_atomic_dec_ret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2741,7 +2753,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_atomic_dec_ret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2762,7 +2774,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou define amdgpu_kernel void 
@global_atomic_dec_ret_i64_offset_system(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2780,7 +2792,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; ; VI-LABEL: global_atomic_dec_ret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2798,7 +2810,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; ; GFX9-LABEL: global_atomic_dec_ret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2811,7 +2823,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; ; GFX10-LABEL: global_atomic_dec_ret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2825,7 +2837,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; ; GFX11-LABEL: global_atomic_dec_ret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2846,7 +2858,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace define amdgpu_kernel void 
@global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2859,7 +2871,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; ; VI-LABEL: global_atomic_dec_noret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2872,7 +2884,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; ; GFX9-LABEL: global_atomic_dec_noret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2884,7 +2896,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; ; GFX10-LABEL: global_atomic_dec_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2897,7 +2909,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; ; GFX11-LABEL: global_atomic_dec_noret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2913,7 +2925,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: 
global_atomic_dec_noret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2928,7 +2940,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; ; VI-LABEL: global_atomic_dec_noret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2943,7 +2955,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; ; GFX9-LABEL: global_atomic_dec_noret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2955,7 +2967,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; ; GFX10-LABEL: global_atomic_dec_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2968,7 +2980,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; ; GFX11-LABEL: global_atomic_dec_noret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2985,7 +2997,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i64_offset_system: ; CI: 
; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3000,7 +3012,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; ; VI-LABEL: global_atomic_dec_noret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3015,7 +3027,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; ; GFX9-LABEL: global_atomic_dec_noret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -3027,7 +3039,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; ; GFX10-LABEL: global_atomic_dec_noret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -3040,7 +3052,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; ; GFX11-LABEL: global_atomic_dec_noret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -3057,7 +3069,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i64_offset_addr64: 
; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -3080,7 +3092,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; ; VI-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -3103,7 +3115,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; ; GFX9-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 @@ -3116,7 +3128,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; ; GFX10-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 @@ -3130,15 +3142,17 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; ; GFX11-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v1, 42 -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 3, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_atomic_dec_u64 
v[0:1], v3, v[1:2], s[2:3] offset:40 glc +; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[2:3] offset:40 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -3154,7 +3168,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -3172,7 +3186,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa ; ; VI-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -3190,7 +3204,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa ; ; GFX9-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -3202,7 +3216,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa ; ; GFX10-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: 
v_lshlrev_b32_e32 v0, 3, v0 @@ -3215,11 +3229,13 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa ; ; GFX11-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 3, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_atomic_dec_u64 v0, v[1:2], s[0:1] offset:40 +; GFX11-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:40 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -3234,7 +3250,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #1 { ; CI-LABEL: atomic_dec_shl_base_lds_0_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, 9 ; CI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; CI-NEXT: v_mov_b32_e32 v2, 0 @@ -3253,7 +3269,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_dec_shl_base_lds_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, 9 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 @@ -3273,7 +3289,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; GFX9-LABEL: atomic_dec_shl_base_lds_0_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v1, 9 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3290,7 +3306,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; GFX10-NEXT: v_mov_b32_e32 v1, 9 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 @@ -3303,18 +3319,21 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; ; GFX11-LABEL: atomic_dec_shl_base_lds_0_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v1, 9 -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 3, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 9 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 2, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 +; GFX11-NEXT: ds_dec_rtn_u64 v[0:1], v3, v[0:1] offset:16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b32 v3, v0, s[2:3] -; GFX11-NEXT: global_store_b64 v3, v[1:2], s[0:1] +; GFX11-NEXT: global_store_b32 v3, v2, s[2:3] +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll index f6a997fb0fb01..b3a7e65f771c4 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -16,8 +16,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -31,8 +31,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: lds_atomic_inc_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -46,8 +46,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: lds_atomic_inc_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -59,11 +59,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: lds_atomic_inc_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u32 v0, v0, v1 ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -74,11 +74,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: lds_atomic_inc_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: ds_inc_rtn_u32 v0, v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -95,8 +95,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_ret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -110,8 +110,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; ; VI-LABEL: lds_atomic_inc_ret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -125,8 +125,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: lds_atomic_inc_ret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], 
s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -138,11 +138,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; ; GFX10-LABEL: lds_atomic_inc_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -153,11 +153,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; ; GFX11-LABEL: lds_atomic_inc_ret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -175,7 +175,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_noret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[4:5], 0x0 +; CI-NEXT: s_load_dword s0, s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -186,7 +186,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; VI-LABEL: lds_atomic_inc_noret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[4:5], 0x0 +; VI-NEXT: s_load_dword s0, s[6:7], 
0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -197,7 +197,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; GFX9-LABEL: lds_atomic_inc_noret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -207,7 +207,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; GFX10-LABEL: lds_atomic_inc_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -218,7 +218,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; GFX11-LABEL: lds_atomic_inc_noret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 ; GFX11-NEXT: ds_inc_u32 v0, v1 @@ -232,7 +232,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 { define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_noret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[4:5], 0x0 +; CI-NEXT: s_load_dword s0, s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -243,7 +243,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr ; ; VI-LABEL: lds_atomic_inc_noret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[4:5], 0x0 +; VI-NEXT: s_load_dword s0, s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -254,7 +254,7 @@ define amdgpu_kernel void 
@lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr ; ; GFX9-LABEL: lds_atomic_inc_noret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s0 @@ -264,7 +264,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr ; ; GFX10-LABEL: lds_atomic_inc_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s0 @@ -275,7 +275,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr ; ; GFX11-LABEL: lds_atomic_inc_noret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: ds_inc_u32 v1, v0 offset:16 @@ -290,7 +290,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -305,7 +305,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; ; VI-LABEL: global_atomic_inc_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -320,7 +320,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: global_atomic_inc_ret_i32: 
; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -332,7 +332,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: global_atomic_inc_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -345,7 +345,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_atomic_inc_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] glc @@ -364,7 +364,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -381,7 +381,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou ; ; VI-LABEL: global_atomic_inc_ret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -398,7 +398,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: global_atomic_inc_ret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -410,7 +410,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou ; ; GFX10-LABEL: global_atomic_inc_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -423,7 +423,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_atomic_inc_ret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 glc @@ -443,7 +443,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -460,7 +460,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; ; VI-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -477,7 +477,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; ; GFX9-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; GFX9: ; %bb.0: 
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -489,7 +489,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; ; GFX10-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -502,7 +502,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; ; GFX11-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 glc @@ -522,7 +522,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -534,7 +534,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 ; ; VI-LABEL: global_atomic_inc_noret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -546,7 +546,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 ; ; GFX9-LABEL: global_atomic_inc_noret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -557,7 +557,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 ; ; GFX10-LABEL: global_atomic_inc_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -569,7 +569,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 ; ; GFX11-LABEL: global_atomic_inc_noret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] @@ -584,7 +584,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -598,7 +598,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % ; ; VI-LABEL: global_atomic_inc_noret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -612,7 +612,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % ; ; GFX9-LABEL: global_atomic_inc_noret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 
0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -623,7 +623,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % ; ; GFX10-LABEL: global_atomic_inc_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -635,7 +635,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % ; ; GFX11-LABEL: global_atomic_inc_noret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16 @@ -651,7 +651,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -665,7 +665,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; ; VI-LABEL: global_atomic_inc_noret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -679,7 +679,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; ; GFX9-LABEL: global_atomic_inc_noret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; 
GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -690,7 +690,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; ; GFX10-LABEL: global_atomic_inc_noret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -702,7 +702,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; ; GFX11-LABEL: global_atomic_inc_noret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16 @@ -718,7 +718,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -740,7 +740,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; ; VI-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -762,7 +762,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; ; GFX9-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: 
s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -774,7 +774,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; ; GFX10-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -787,8 +787,10 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; ; GFX11-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_inc_u32 v1, v0, v1, s[2:3] offset:20 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -810,7 +812,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -827,7 +829,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa ; ; VI-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 
v0, s0 @@ -844,7 +846,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa ; ; GFX9-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -855,7 +857,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa ; ; GFX10-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -867,8 +869,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa ; ; GFX11-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_inc_u32 v0, v1, s[0:1] offset:20 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -885,7 +889,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #1 { ; CI-LABEL: atomic_inc_shl_base_lds_0_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; CI-NEXT: v_mov_b32_e32 v2, 9 ; CI-NEXT: s_mov_b32 m0, -1 @@ -903,7 +907,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; ; VI-LABEL: 
atomic_inc_shl_base_lds_0_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_mov_b32_e32 v2, 9 ; VI-NEXT: s_mov_b32 m0, -1 @@ -921,7 +925,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -937,7 +941,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; GFX10: ; %bb.0: ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 9 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u32 v1, v1, v2 offset:8 @@ -950,8 +954,10 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; ; GFX11-LABEL: atomic_inc_shl_base_lds_0_i32: ; GFX11: ; %bb.0: +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 9 :: v_dual_lshlrev_b32 v1, 2, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_inc_rtn_u32 v1, v1, v2 offset:8 @@ -976,8 +982,8 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_ret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; 
CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -992,8 +998,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: lds_atomic_inc_ret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -1008,8 +1014,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: lds_atomic_inc_ret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1022,12 +1028,12 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: lds_atomic_inc_ret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1038,12 +1044,12 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: lds_atomic_inc_ret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s2 
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -1060,8 +1066,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_ret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -1076,8 +1082,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: lds_atomic_inc_ret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -1092,8 +1098,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: lds_atomic_inc_ret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1106,12 +1112,12 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; ; GFX10-LABEL: lds_atomic_inc_ret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1122,12 +1128,12 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; ; GFX11-LABEL: lds_atomic_inc_ret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -1145,7 +1151,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_noret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[4:5], 0x0 +; CI-NEXT: s_load_dword s0, s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -1157,7 +1163,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; VI-LABEL: lds_atomic_inc_noret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[4:5], 0x0 +; VI-NEXT: s_load_dword s0, s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -1169,7 +1175,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; GFX9-LABEL: lds_atomic_inc_noret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: 
v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1180,7 +1186,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; GFX10-LABEL: lds_atomic_inc_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1192,7 +1198,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; GFX11-LABEL: lds_atomic_inc_noret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 @@ -1207,7 +1213,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 { define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_noret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[4:5], 0x0 +; CI-NEXT: s_load_dword s0, s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -1219,7 +1225,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr ; ; VI-LABEL: lds_atomic_inc_noret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[4:5], 0x0 +; VI-NEXT: s_load_dword s0, s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -1231,7 +1237,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr ; ; GFX9-LABEL: lds_atomic_inc_noret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1242,7 +1248,7 @@ define amdgpu_kernel void 
@lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr ; ; GFX10-LABEL: lds_atomic_inc_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1254,7 +1260,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr ; ; GFX11-LABEL: lds_atomic_inc_noret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 @@ -1270,7 +1276,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1286,7 +1292,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: global_atomic_inc_ret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1302,7 +1308,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: global_atomic_inc_ret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1315,7 +1321,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: 
global_atomic_inc_ret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1329,7 +1335,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_atomic_inc_ret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1349,7 +1355,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1367,7 +1373,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: global_atomic_inc_ret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1385,7 +1391,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: global_atomic_inc_ret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1398,7 +1404,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; ; GFX10-LABEL: global_atomic_inc_ret_i64_offset: ; GFX10: ; %bb.0: -; 
GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1412,7 +1418,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_atomic_inc_ret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1433,7 +1439,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1451,7 +1457,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; ; VI-LABEL: global_atomic_inc_ret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1469,7 +1475,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; ; GFX9-LABEL: global_atomic_inc_ret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1482,7 +1488,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; ; GFX10-LABEL: global_atomic_inc_ret_i64_offset_system: ; GFX10: ; %bb.0: -; 
GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1496,7 +1502,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; ; GFX11-LABEL: global_atomic_inc_ret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1517,7 +1523,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1530,7 +1536,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; ; VI-LABEL: global_atomic_inc_noret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1543,7 +1549,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; ; GFX9-LABEL: global_atomic_inc_noret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1555,7 +1561,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; ; GFX10-LABEL: global_atomic_inc_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 
s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1568,7 +1574,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; ; GFX11-LABEL: global_atomic_inc_noret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1584,7 +1590,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1599,7 +1605,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; ; VI-LABEL: global_atomic_inc_noret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1614,7 +1620,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; ; GFX9-LABEL: global_atomic_inc_noret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1626,7 +1632,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; ; GFX10-LABEL: global_atomic_inc_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; 
GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1639,7 +1645,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; ; GFX11-LABEL: global_atomic_inc_noret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1656,7 +1662,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1671,7 +1677,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; ; VI-LABEL: global_atomic_inc_noret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1686,7 +1692,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; ; GFX9-LABEL: global_atomic_inc_noret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1698,7 +1704,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; ; GFX10-LABEL: global_atomic_inc_noret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: 
v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1711,7 +1717,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; ; GFX11-LABEL: global_atomic_inc_noret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1728,7 +1734,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -1751,7 +1757,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; ; VI-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1774,7 +1780,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; ; GFX9-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 @@ -1787,7 +1793,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; ; GFX10-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: 
v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 @@ -1801,15 +1807,17 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; ; GFX11-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v1, 42 -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 3, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v3, v[1:2], s[2:3] offset:40 glc +; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:40 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1825,7 +1833,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1843,7 +1851,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa ; ; VI-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1861,7 
+1869,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa ; ; GFX9-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -1873,7 +1881,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa ; ; GFX10-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -1886,11 +1894,13 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa ; ; GFX11-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 3, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_atomic_inc_u64 v0, v[1:2], s[0:1] offset:40 +; GFX11-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:40 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -1905,7 +1915,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; 
CI-NEXT: v_mov_b32_e32 v0, s2 @@ -1920,7 +1930,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_inc_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1935,7 +1945,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_inc_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1950,7 +1960,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 @@ -1966,7 +1976,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_inc_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -1985,7 +1995,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -2002,7 +2012,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr 
%ptr) #1 ; ; VI-LABEL: flat_atomic_inc_ret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -2019,7 +2029,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2034,7 +2044,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, 16 @@ -2052,7 +2062,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2072,7 +2082,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -2089,7 +2099,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; ; VI-LABEL: 
flat_atomic_inc_ret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -2106,7 +2116,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2121,7 +2131,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, 16 @@ -2139,7 +2149,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; ; GFX11-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2159,7 +2169,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2171,7 +2181,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_inc_noret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2183,7 +2193,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_inc_noret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2195,7 +2205,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -2209,7 +2219,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_inc_noret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -2226,7 +2236,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -2240,7 +2250,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_inc_noret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 
16 @@ -2254,7 +2264,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2266,7 +2276,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 16 @@ -2282,7 +2292,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -2300,7 +2310,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -2314,7 +2324,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_inc_noret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -2328,7 +2338,7 @@ define amdgpu_kernel void 
@flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2340,7 +2350,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 16 @@ -2356,7 +2366,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -2374,7 +2384,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2396,7 +2406,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; ; VI-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2418,7 +2428,7 @@ define amdgpu_kernel void 
@flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2438,7 +2448,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2461,12 +2471,14 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX11-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_lshlrev_b32 v2, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_atomic_inc_u32 v3, v[0:1], v3 offset:20 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2490,7 +2502,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2507,7 +2519,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2524,7 +2536,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2539,7 +2551,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -2558,12 +2570,14 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:20 @@ -2582,7 +2596,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #1 { ; CI-LABEL: atomic_inc_shl_base_lds_0_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, 9 ; CI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; CI-NEXT: v_mov_b32_e32 v2, 0 @@ -2601,7 +2615,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_inc_shl_base_lds_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, 9 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 @@ -2621,7 +2635,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v1, 9 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2638,7 +2652,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; GFX10-NEXT: v_mov_b32_e32 v1, 9 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 @@ -2651,18 +2665,21 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; ; GFX11-LABEL: 
atomic_inc_shl_base_lds_0_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v1, 9 -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 3, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 9 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 2, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 +; GFX11-NEXT: ds_inc_rtn_u64 v[0:1], v3, v[0:1] offset:16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b32 v3, v0, s[2:3] -; GFX11-NEXT: global_store_b64 v3, v[1:2], s[0:1] +; GFX11-NEXT: global_store_b32 v3, v2, s[2:3] +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2678,7 +2695,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2699,7 +2716,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_inc_ret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2720,7 +2737,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX9-LABEL: 
flat_atomic_inc_ret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2736,7 +2753,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_ret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2753,7 +2770,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_inc_ret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 @@ -2773,7 +2790,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2796,7 +2813,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_inc_ret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2819,7 +2836,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: 
s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2835,7 +2852,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2854,7 +2871,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_inc_ret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 @@ -2875,7 +2892,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2898,7 +2915,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; ; VI-LABEL: flat_atomic_inc_ret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2921,7 +2938,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: 
v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2937,7 +2954,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2956,7 +2973,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; ; GFX11-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 @@ -2977,7 +2994,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2990,7 +3007,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_inc_noret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3003,7 +3020,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_inc_noret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3016,7 +3033,7 @@ define 
amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3031,7 +3048,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_inc_noret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -3049,7 +3066,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3064,7 +3081,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_inc_noret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3079,7 +3096,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3092,7 +3109,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3109,7 +3126,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_inc_noret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -3128,7 +3145,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3143,7 +3160,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_inc_noret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3158,7 +3175,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3171,7 +3188,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: 
v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3188,7 +3205,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -3207,7 +3224,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -3233,7 +3250,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; ; VI-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -3259,7 +3276,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -3280,7 +3297,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 @@ -3304,14 +3321,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX11-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:40 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3335,7 +3353,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -3353,7 +3371,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -3371,7 
+3389,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -3387,7 +3405,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -3407,14 +3425,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:40 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -3432,12 +3451,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 define amdgpu_kernel void 
@nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: nocse_lds_atomic_inc_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s6, s[4:5], 0x4 -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v1, s6 +; CI-NEXT: v_mov_b32_e32 v1, s4 ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_inc_rtn_u32 v3, v1, v0 @@ -3452,12 +3471,12 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; ; VI-LABEL: nocse_lds_atomic_inc_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x10 -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_inc_rtn_u32 v3, v1, v0 @@ -3472,11 +3491,11 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; ; GFX9-LABEL: nocse_lds_atomic_inc_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_inc_rtn_u32 v2, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_inc_rtn_u32 v0, v0, v1 @@ -3488,11 +3507,11 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; ; GFX10-LABEL: nocse_lds_atomic_inc_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: 
s_load_dword s0, s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u32 v2, v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3507,10 +3526,10 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; ; GFX11-LABEL: nocse_lds_atomic_inc_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s2 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_inc_rtn_u32 v2, v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll index bb5ccc3657dc4..c45bccd184c12 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll @@ -66,7 +66,7 @@ define amdgpu_ps i32 @select_sgpr_trunc_and_cond(i32 inreg %a.0, i32 inreg %a.1, define amdgpu_kernel void @sgpr_trunc_brcond(i32 %cond) { ; WAVE64-LABEL: sgpr_trunc_brcond: ; WAVE64: ; %bb.0: ; %entry -; WAVE64-NEXT: s_load_dword s0, s[0:1], 0x24 +; WAVE64-NEXT: s_load_dword s0, s[2:3], 0x24 ; WAVE64-NEXT: s_waitcnt lgkmcnt(0) ; WAVE64-NEXT: s_xor_b32 s0, s0, 1 ; WAVE64-NEXT: s_and_b32 s0, s0, 1 @@ -83,7 +83,7 @@ define amdgpu_kernel void @sgpr_trunc_brcond(i32 %cond) { ; ; WAVE32-LABEL: sgpr_trunc_brcond: ; WAVE32: ; %bb.0: ; %entry -; WAVE32-NEXT: s_load_dword s0, s[0:1], 0x24 +; WAVE32-NEXT: s_load_dword s0, s[2:3], 0x24 ; WAVE32-NEXT: s_waitcnt lgkmcnt(0) ; 
WAVE32-NEXT: s_xor_b32 s0, s0, 1 ; WAVE32-NEXT: s_and_b32 s0, s0, 1 @@ -113,7 +113,7 @@ bb1: define amdgpu_kernel void @brcond_sgpr_trunc_and(i32 %cond0, i32 %cond1) { ; WAVE64-LABEL: brcond_sgpr_trunc_and: ; WAVE64: ; %bb.0: ; %entry -; WAVE64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; WAVE64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; WAVE64-NEXT: s_waitcnt lgkmcnt(0) ; WAVE64-NEXT: s_and_b32 s0, s0, s1 ; WAVE64-NEXT: s_xor_b32 s0, s0, 1 @@ -131,7 +131,7 @@ define amdgpu_kernel void @brcond_sgpr_trunc_and(i32 %cond0, i32 %cond1) { ; ; WAVE32-LABEL: brcond_sgpr_trunc_and: ; WAVE32: ; %bb.0: ; %entry -; WAVE32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; WAVE32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; WAVE32-NEXT: s_waitcnt lgkmcnt(0) ; WAVE32-NEXT: s_and_b32 s0, s0, s1 ; WAVE32-NEXT: s_xor_b32 s0, s0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll index 24652982c6584..e4c609c933108 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -12,9 +12,9 @@ declare hidden void @external_void_func_byval(ptr addrspace(5) byval([16 x i32]) define amdgpu_kernel void @kernel_caller_stack() { ; MUBUF-LABEL: kernel_caller_stack: ; MUBUF: ; %bb.0: -; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7 -; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; MUBUF-NEXT: s_add_u32 s0, s0, s7 +; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; MUBUF-NEXT: s_add_u32 s0, s0, s15 ; MUBUF-NEXT: s_mov_b32 s32, 0 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v0, 9 @@ -34,8 +34,8 @@ define amdgpu_kernel void @kernel_caller_stack() { ; FLATSCR-LABEL: kernel_caller_stack: ; FLATSCR: ; %bb.0: ; FLATSCR-NEXT: s_mov_b32 s32, 0 -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; 
FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; FLATSCR-NEXT: s_add_u32 s0, s32, 4 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 9 ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 @@ -60,9 +60,9 @@ define amdgpu_kernel void @kernel_caller_stack() { define amdgpu_kernel void @kernel_caller_byval() { ; MUBUF-LABEL: kernel_caller_byval: ; MUBUF: ; %bb.0: -; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7 -; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; MUBUF-NEXT: s_add_u32 s0, s0, s7 +; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; MUBUF-NEXT: s_add_u32 s0, s0, s15 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -155,9 +155,9 @@ define amdgpu_kernel void @kernel_caller_byval() { ; ; FLATSCR-LABEL: kernel_caller_byval: ; FLATSCR: ; %bb.0: -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll index eb20178f9f4d8..405b1e8f3a250 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -452,7 +452,7 @@ define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind { define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_i8_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -468,7 
+468,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; ; VI-LABEL: load_i8_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 @@ -493,7 +493,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v2i8_to_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -513,7 +513,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v2i8_to_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -539,7 +539,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v3i8_to_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -562,7 +562,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v3i8_to_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -589,7 +589,7 @@ define amdgpu_kernel void 
@load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -612,7 +612,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v4i8_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -644,7 +644,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_unaligned: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -679,7 +679,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -725,14 +725,14 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out2, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_2_uses: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 
0xd ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[4:5], s[0:1] @@ -769,17 +769,17 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; ; VI-LABEL: load_v4i8_to_v4f32_2_uses: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v6, 9 ; VI-NEXT: v_mov_b32_e32 v7, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v1, v[0:1] -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0xff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -821,7 +821,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v7i8_to_v7f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -858,7 +858,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v7i8_to_v7f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -918,7 +918,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v8i8_to_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -949,7 +949,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v8i8_to_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -986,7 +986,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1005,7 +1005,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1033,7 +1033,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_hi1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1051,7 +1051,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_hi1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1080,7 +1080,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -1096,7 +1096,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: i8_zext_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 @@ -1122,7 +1122,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v4i8_zext_v4i32_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1157,7 +1157,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: v4i8_zext_v4i32_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1204,7 +1204,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte0_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1221,7 +1221,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte0_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1247,7 +1247,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1265,7 +1265,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1292,7 +1292,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias 
%out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte2_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1310,7 +1310,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte2_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1337,7 +1337,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte3_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1354,7 +1354,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte3_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1381,7 +1381,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: cvt_ubyte0_or_multiuse: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1401,7 +1401,7 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, 
ptr addr ; ; VI-LABEL: cvt_ubyte0_or_multiuse: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll index 78d908455e019..5515de0cd2fee 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -193,7 +193,7 @@ bb12: define amdgpu_kernel void @break_loop(i32 %arg) { ; CHECK-LABEL: break_loop: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0 +; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 ; CHECK-NEXT: s_mov_b64 s[0:1], 0 ; CHECK-NEXT: ; implicit-def: $vgpr1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll index 96db1f889690d..48986ea9ef982 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll @@ -8,8 +8,8 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align4(i32 %n) { ; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX9-NEXT: s_add_u32 s0, s0, s9 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 s0, s0, s15 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_movk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -25,8 +25,8 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align4(i32 %n) { ; ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-NEXT: s_add_u32 s0, s0, s9 +; GFX10-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-NEXT: s_add_u32 s0, s0, s15 ; GFX10-NEXT: s_addc_u32 s1, 
s1, 0 ; GFX10-NEXT: s_movk_i32 s32, 0x200 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -42,7 +42,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align4(i32 %n) { ; ; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align4: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mov_b32 s32, 16 ; GFX11-NEXT: s_mov_b32 s33, 0 @@ -143,8 +143,8 @@ define void @func_dynamic_stackalloc_sgpr_align4() { define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align16(i32 %n) { ; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX9-NEXT: s_add_u32 s0, s0, s9 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 s0, s0, s15 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_movk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -160,8 +160,8 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align16(i32 %n) { ; ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-NEXT: s_add_u32 s0, s0, s9 +; GFX10-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-NEXT: s_add_u32 s0, s0, s15 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_movk_i32 s32, 0x200 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -177,7 +177,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align16(i32 %n) { ; ; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mov_b32 s32, 16 ; GFX11-NEXT: s_mov_b32 s33, 0 @@ -278,8 +278,8 @@ define void @func_dynamic_stackalloc_sgpr_align16() { define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) { ; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX9-NEXT: s_add_u32 s0, s0, s9 +; 
GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 s0, s0, s15 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -296,8 +296,8 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) { ; ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-NEXT: s_add_u32 s0, s0, s9 +; GFX10-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-NEXT: s_add_u32 s0, s0, s15 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_movk_i32 s32, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -314,7 +314,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) { ; ; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s32, 32 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mov_b32 s33, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index 1e1c90d142a1f..34efb089b72bf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -3037,21 +3037,21 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: enable_mem_ordered = 0 ; GPRIDX-NEXT: enable_fwd_progress = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GPRIDX-NEXT: user_sgpr_count = 6 +; GPRIDX-NEXT: user_sgpr_count = 10 ; GPRIDX-NEXT: enable_trap_handler = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1 -; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 0 -; GPRIDX-NEXT: enable_sgpr_workgroup_id_z = 0 +; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1 +; GPRIDX-NEXT: enable_sgpr_workgroup_id_z = 1 ; GPRIDX-NEXT: enable_sgpr_workgroup_info = 0 -; GPRIDX-NEXT: enable_vgpr_workitem_id = 0 +; GPRIDX-NEXT: enable_vgpr_workitem_id = 2 ; GPRIDX-NEXT: 
enable_exception_msb = 0 ; GPRIDX-NEXT: granulated_lds_size = 0 ; GPRIDX-NEXT: enable_exception = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_buffer = 1 -; GPRIDX-NEXT: enable_sgpr_dispatch_ptr = 0 +; GPRIDX-NEXT: enable_sgpr_dispatch_ptr = 1 ; GPRIDX-NEXT: enable_sgpr_queue_ptr = 0 ; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GPRIDX-NEXT: enable_sgpr_dispatch_id = 0 +; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1 ; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -3067,7 +3067,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: workitem_private_segment_byte_size = 0 ; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0 ; GPRIDX-NEXT: gds_segment_byte_size = 0 -; GPRIDX-NEXT: kernarg_segment_byte_size = 12 +; GPRIDX-NEXT: kernarg_segment_byte_size = 28 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 ; GPRIDX-NEXT: wavefront_sgpr_count = 13 ; GPRIDX-NEXT: workitem_vgpr_count = 3 @@ -3085,8 +3085,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: runtime_loader_kernel_symbol = 0 ; GPRIDX-NEXT: .end_amd_kernel_code_t ; GPRIDX-NEXT: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GPRIDX-NEXT: s_load_dword s8, s[4:5], 0x8 +; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GPRIDX-NEXT: s_load_dword s8, s[6:7], 0x8 ; GPRIDX-NEXT: s_mov_b32 s4, 0 ; GPRIDX-NEXT: s_mov_b32 s5, 0x40080000 ; GPRIDX-NEXT: s_mov_b32 s2, 0 @@ -3128,21 +3128,21 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: enable_mem_ordered = 0 ; MOVREL-NEXT: enable_fwd_progress = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; MOVREL-NEXT: user_sgpr_count = 6 +; MOVREL-NEXT: user_sgpr_count = 10 ; MOVREL-NEXT: enable_trap_handler = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1 -; MOVREL-NEXT: 
enable_sgpr_workgroup_id_y = 0 -; MOVREL-NEXT: enable_sgpr_workgroup_id_z = 0 +; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1 +; MOVREL-NEXT: enable_sgpr_workgroup_id_z = 1 ; MOVREL-NEXT: enable_sgpr_workgroup_info = 0 -; MOVREL-NEXT: enable_vgpr_workitem_id = 0 +; MOVREL-NEXT: enable_vgpr_workitem_id = 2 ; MOVREL-NEXT: enable_exception_msb = 0 ; MOVREL-NEXT: granulated_lds_size = 0 ; MOVREL-NEXT: enable_exception = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_buffer = 1 -; MOVREL-NEXT: enable_sgpr_dispatch_ptr = 0 +; MOVREL-NEXT: enable_sgpr_dispatch_ptr = 1 ; MOVREL-NEXT: enable_sgpr_queue_ptr = 0 ; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; MOVREL-NEXT: enable_sgpr_dispatch_id = 0 +; MOVREL-NEXT: enable_sgpr_dispatch_id = 1 ; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_size = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -3158,7 +3158,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: workitem_private_segment_byte_size = 0 ; MOVREL-NEXT: workgroup_group_segment_byte_size = 0 ; MOVREL-NEXT: gds_segment_byte_size = 0 -; MOVREL-NEXT: kernarg_segment_byte_size = 12 +; MOVREL-NEXT: kernarg_segment_byte_size = 28 ; MOVREL-NEXT: workgroup_fbarrier_count = 0 ; MOVREL-NEXT: wavefront_sgpr_count = 9 ; MOVREL-NEXT: workitem_vgpr_count = 4 @@ -3176,8 +3176,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: runtime_loader_kernel_symbol = 0 ; MOVREL-NEXT: .end_amd_kernel_code_t ; MOVREL-NEXT: ; %bb.0: ; %entry -; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; MOVREL-NEXT: s_load_dword s8, s[4:5], 0x8 +; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; MOVREL-NEXT: s_load_dword s8, s[6:7], 0x8 ; MOVREL-NEXT: s_mov_b32 s4, 0 ; MOVREL-NEXT: s_mov_b32 s5, 0x40080000 ; MOVREL-NEXT: s_mov_b32 s2, 0 @@ -3209,7 +3209,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; 
GFX10-NEXT: kernel_code_entry_byte_offset = 256 ; GFX10-NEXT: kernel_code_prefetch_byte_size = 0 ; GFX10-NEXT: granulated_workitem_vgpr_count = 0 -; GFX10-NEXT: granulated_wavefront_sgpr_count = 0 +; GFX10-NEXT: granulated_wavefront_sgpr_count = 1 ; GFX10-NEXT: priority = 0 ; GFX10-NEXT: float_mode = 240 ; GFX10-NEXT: priv = 0 @@ -3220,21 +3220,21 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: enable_mem_ordered = 1 ; GFX10-NEXT: enable_fwd_progress = 0 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX10-NEXT: user_sgpr_count = 6 +; GFX10-NEXT: user_sgpr_count = 10 ; GFX10-NEXT: enable_trap_handler = 0 ; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1 -; GFX10-NEXT: enable_sgpr_workgroup_id_y = 0 -; GFX10-NEXT: enable_sgpr_workgroup_id_z = 0 +; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1 +; GFX10-NEXT: enable_sgpr_workgroup_id_z = 1 ; GFX10-NEXT: enable_sgpr_workgroup_info = 0 -; GFX10-NEXT: enable_vgpr_workitem_id = 0 +; GFX10-NEXT: enable_vgpr_workitem_id = 2 ; GFX10-NEXT: enable_exception_msb = 0 ; GFX10-NEXT: granulated_lds_size = 0 ; GFX10-NEXT: enable_exception = 0 ; GFX10-NEXT: enable_sgpr_private_segment_buffer = 1 -; GFX10-NEXT: enable_sgpr_dispatch_ptr = 0 +; GFX10-NEXT: enable_sgpr_dispatch_ptr = 1 ; GFX10-NEXT: enable_sgpr_queue_ptr = 0 ; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GFX10-NEXT: enable_sgpr_dispatch_id = 0 +; GFX10-NEXT: enable_sgpr_dispatch_id = 1 ; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0 ; GFX10-NEXT: enable_sgpr_private_segment_size = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -3250,9 +3250,9 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: workitem_private_segment_byte_size = 0 ; GFX10-NEXT: workgroup_group_segment_byte_size = 0 ; GFX10-NEXT: gds_segment_byte_size = 0 -; GFX10-NEXT: kernarg_segment_byte_size = 12 +; GFX10-NEXT: kernarg_segment_byte_size = 28 ; GFX10-NEXT: 
workgroup_fbarrier_count = 0 -; GFX10-NEXT: wavefront_sgpr_count = 7 +; GFX10-NEXT: wavefront_sgpr_count = 9 ; GFX10-NEXT: workitem_vgpr_count = 3 ; GFX10-NEXT: reserved_vgpr_first = 0 ; GFX10-NEXT: reserved_vgpr_count = 0 @@ -3269,21 +3269,21 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: .end_amd_kernel_code_t ; GFX10-NEXT: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s6, s[4:5], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s8, s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: s_mov_b32 s3, 0x40080000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_eq_u32 s6, 1 +; GFX10-NEXT: s_cmp_eq_u32 s8, 1 ; GFX10-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 -; GFX10-NEXT: s_cmp_eq_u32 s6, 2 +; GFX10-NEXT: s_cmp_eq_u32 s8, 2 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GFX10-NEXT: s_cmp_eq_u32 s6, 3 +; GFX10-NEXT: s_cmp_eq_u32 s8, 3 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s5, 0x40140000 ; GFX10-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] -; GFX10-NEXT: s_cmp_eq_u32 s6, 4 +; GFX10-NEXT: s_cmp_eq_u32 s8, 4 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 @@ -3312,21 +3312,21 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX11-NEXT: enable_mem_ordered = 1 ; GFX11-NEXT: enable_fwd_progress = 0 ; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX11-NEXT: user_sgpr_count = 15 +; GFX11-NEXT: user_sgpr_count = 13 ; GFX11-NEXT: enable_trap_handler = 0 ; GFX11-NEXT: enable_sgpr_workgroup_id_x = 1 -; GFX11-NEXT: enable_sgpr_workgroup_id_y = 0 -; GFX11-NEXT: enable_sgpr_workgroup_id_z = 0 +; GFX11-NEXT: enable_sgpr_workgroup_id_y = 1 +; GFX11-NEXT: enable_sgpr_workgroup_id_z = 1 ; GFX11-NEXT: enable_sgpr_workgroup_info = 0 -; GFX11-NEXT: 
enable_vgpr_workitem_id = 0 +; GFX11-NEXT: enable_vgpr_workitem_id = 2 ; GFX11-NEXT: enable_exception_msb = 0 ; GFX11-NEXT: granulated_lds_size = 0 ; GFX11-NEXT: enable_exception = 0 ; GFX11-NEXT: enable_sgpr_private_segment_buffer = 0 -; GFX11-NEXT: enable_sgpr_dispatch_ptr = 0 +; GFX11-NEXT: enable_sgpr_dispatch_ptr = 1 ; GFX11-NEXT: enable_sgpr_queue_ptr = 0 ; GFX11-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GFX11-NEXT: enable_sgpr_dispatch_id = 0 +; GFX11-NEXT: enable_sgpr_dispatch_id = 1 ; GFX11-NEXT: enable_sgpr_flat_scratch_init = 0 ; GFX11-NEXT: enable_sgpr_private_segment_size = 0 ; GFX11-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -3342,7 +3342,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX11-NEXT: workitem_private_segment_byte_size = 0 ; GFX11-NEXT: workgroup_group_segment_byte_size = 0 ; GFX11-NEXT: gds_segment_byte_size = 0 -; GFX11-NEXT: kernarg_segment_byte_size = 12 +; GFX11-NEXT: kernarg_segment_byte_size = 28 ; GFX11-NEXT: workgroup_fbarrier_count = 0 ; GFX11-NEXT: wavefront_sgpr_count = 7 ; GFX11-NEXT: workitem_vgpr_count = 3 @@ -3361,8 +3361,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX11-NEXT: .end_amd_kernel_code_t ; GFX11-NEXT: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s6, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s2, 0 ; GFX11-NEXT: s_mov_b32 s3, 0x40080000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 @@ -4054,21 +4054,21 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: enable_mem_ordered = 0 ; GPRIDX-NEXT: enable_fwd_progress = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GPRIDX-NEXT: user_sgpr_count = 6 +; GPRIDX-NEXT: user_sgpr_count = 10 ; GPRIDX-NEXT: enable_trap_handler = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1 -; 
GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 0 -; GPRIDX-NEXT: enable_sgpr_workgroup_id_z = 0 +; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1 +; GPRIDX-NEXT: enable_sgpr_workgroup_id_z = 1 ; GPRIDX-NEXT: enable_sgpr_workgroup_info = 0 -; GPRIDX-NEXT: enable_vgpr_workitem_id = 0 +; GPRIDX-NEXT: enable_vgpr_workitem_id = 2 ; GPRIDX-NEXT: enable_exception_msb = 0 ; GPRIDX-NEXT: granulated_lds_size = 0 ; GPRIDX-NEXT: enable_exception = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_buffer = 1 -; GPRIDX-NEXT: enable_sgpr_dispatch_ptr = 0 +; GPRIDX-NEXT: enable_sgpr_dispatch_ptr = 1 ; GPRIDX-NEXT: enable_sgpr_queue_ptr = 0 ; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GPRIDX-NEXT: enable_sgpr_dispatch_id = 0 +; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1 ; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -4084,9 +4084,9 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: workitem_private_segment_byte_size = 0 ; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0 ; GPRIDX-NEXT: gds_segment_byte_size = 0 -; GPRIDX-NEXT: kernarg_segment_byte_size = 12 +; GPRIDX-NEXT: kernarg_segment_byte_size = 28 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 10 +; GPRIDX-NEXT: wavefront_sgpr_count = 12 ; GPRIDX-NEXT: workitem_vgpr_count = 2 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 @@ -4102,8 +4102,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: runtime_loader_kernel_symbol = 0 ; GPRIDX-NEXT: .end_amd_kernel_code_t ; GPRIDX-NEXT: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_load_dword s2, s[4:5], 0x8 -; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GPRIDX-NEXT: s_load_dword s2, s[6:7], 0x8 +; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GPRIDX-NEXT: v_mov_b32_e32 v1, 0 ; GPRIDX-NEXT: s_waitcnt lgkmcnt(0) ; 
GPRIDX-NEXT: s_cmp_eq_u32 s2, 1 @@ -4138,21 +4138,21 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: enable_mem_ordered = 0 ; MOVREL-NEXT: enable_fwd_progress = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; MOVREL-NEXT: user_sgpr_count = 6 +; MOVREL-NEXT: user_sgpr_count = 10 ; MOVREL-NEXT: enable_trap_handler = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1 -; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 0 -; MOVREL-NEXT: enable_sgpr_workgroup_id_z = 0 +; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1 +; MOVREL-NEXT: enable_sgpr_workgroup_id_z = 1 ; MOVREL-NEXT: enable_sgpr_workgroup_info = 0 -; MOVREL-NEXT: enable_vgpr_workitem_id = 0 +; MOVREL-NEXT: enable_vgpr_workitem_id = 2 ; MOVREL-NEXT: enable_exception_msb = 0 ; MOVREL-NEXT: granulated_lds_size = 0 ; MOVREL-NEXT: enable_exception = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_buffer = 1 -; MOVREL-NEXT: enable_sgpr_dispatch_ptr = 0 +; MOVREL-NEXT: enable_sgpr_dispatch_ptr = 1 ; MOVREL-NEXT: enable_sgpr_queue_ptr = 0 ; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; MOVREL-NEXT: enable_sgpr_dispatch_id = 0 +; MOVREL-NEXT: enable_sgpr_dispatch_id = 1 ; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_size = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -4168,9 +4168,9 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: workitem_private_segment_byte_size = 0 ; MOVREL-NEXT: workgroup_group_segment_byte_size = 0 ; MOVREL-NEXT: gds_segment_byte_size = 0 -; MOVREL-NEXT: kernarg_segment_byte_size = 12 +; MOVREL-NEXT: kernarg_segment_byte_size = 28 ; MOVREL-NEXT: workgroup_fbarrier_count = 0 -; MOVREL-NEXT: wavefront_sgpr_count = 6 +; MOVREL-NEXT: wavefront_sgpr_count = 8 ; MOVREL-NEXT: workitem_vgpr_count = 3 ; MOVREL-NEXT: reserved_vgpr_first = 0 ; MOVREL-NEXT: reserved_vgpr_count = 0 @@ -4186,8 +4186,8 @@ define amdgpu_kernel void 
@dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: runtime_loader_kernel_symbol = 0 ; MOVREL-NEXT: .end_amd_kernel_code_t ; MOVREL-NEXT: ; %bb.0: ; %entry -; MOVREL-NEXT: s_load_dword s2, s[4:5], 0x8 -; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; MOVREL-NEXT: s_load_dword s2, s[6:7], 0x8 +; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; MOVREL-NEXT: s_cmp_eq_u32 s2, 1 ; MOVREL-NEXT: s_cselect_b32 s3, 2.0, 1.0 @@ -4223,21 +4223,21 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_mem_ordered = 1 ; GFX10-NEXT: enable_fwd_progress = 0 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX10-NEXT: user_sgpr_count = 6 +; GFX10-NEXT: user_sgpr_count = 10 ; GFX10-NEXT: enable_trap_handler = 0 ; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1 -; GFX10-NEXT: enable_sgpr_workgroup_id_y = 0 -; GFX10-NEXT: enable_sgpr_workgroup_id_z = 0 +; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1 +; GFX10-NEXT: enable_sgpr_workgroup_id_z = 1 ; GFX10-NEXT: enable_sgpr_workgroup_info = 0 -; GFX10-NEXT: enable_vgpr_workitem_id = 0 +; GFX10-NEXT: enable_vgpr_workitem_id = 2 ; GFX10-NEXT: enable_exception_msb = 0 ; GFX10-NEXT: granulated_lds_size = 0 ; GFX10-NEXT: enable_exception = 0 ; GFX10-NEXT: enable_sgpr_private_segment_buffer = 1 -; GFX10-NEXT: enable_sgpr_dispatch_ptr = 0 +; GFX10-NEXT: enable_sgpr_dispatch_ptr = 1 ; GFX10-NEXT: enable_sgpr_queue_ptr = 0 ; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GFX10-NEXT: enable_sgpr_dispatch_id = 0 +; GFX10-NEXT: enable_sgpr_dispatch_id = 1 ; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0 ; GFX10-NEXT: enable_sgpr_private_segment_size = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -4253,9 +4253,9 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: workitem_private_segment_byte_size = 0 ; GFX10-NEXT: workgroup_group_segment_byte_size = 0 ; GFX10-NEXT: 
gds_segment_byte_size = 0 -; GFX10-NEXT: kernarg_segment_byte_size = 12 +; GFX10-NEXT: kernarg_segment_byte_size = 28 ; GFX10-NEXT: workgroup_fbarrier_count = 0 -; GFX10-NEXT: wavefront_sgpr_count = 6 +; GFX10-NEXT: wavefront_sgpr_count = 8 ; GFX10-NEXT: workitem_vgpr_count = 2 ; GFX10-NEXT: reserved_vgpr_first = 0 ; GFX10-NEXT: reserved_vgpr_count = 0 @@ -4272,8 +4272,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: .end_amd_kernel_code_t ; GFX10-NEXT: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_eq_u32 s2, 1 @@ -4308,21 +4308,21 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: enable_mem_ordered = 1 ; GFX11-NEXT: enable_fwd_progress = 0 ; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX11-NEXT: user_sgpr_count = 15 +; GFX11-NEXT: user_sgpr_count = 13 ; GFX11-NEXT: enable_trap_handler = 0 ; GFX11-NEXT: enable_sgpr_workgroup_id_x = 1 -; GFX11-NEXT: enable_sgpr_workgroup_id_y = 0 -; GFX11-NEXT: enable_sgpr_workgroup_id_z = 0 +; GFX11-NEXT: enable_sgpr_workgroup_id_y = 1 +; GFX11-NEXT: enable_sgpr_workgroup_id_z = 1 ; GFX11-NEXT: enable_sgpr_workgroup_info = 0 -; GFX11-NEXT: enable_vgpr_workitem_id = 0 +; GFX11-NEXT: enable_vgpr_workitem_id = 2 ; GFX11-NEXT: enable_exception_msb = 0 ; GFX11-NEXT: granulated_lds_size = 0 ; GFX11-NEXT: enable_exception = 0 ; GFX11-NEXT: enable_sgpr_private_segment_buffer = 0 -; GFX11-NEXT: enable_sgpr_dispatch_ptr = 0 +; GFX11-NEXT: enable_sgpr_dispatch_ptr = 1 ; GFX11-NEXT: enable_sgpr_queue_ptr = 0 ; GFX11-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GFX11-NEXT: enable_sgpr_dispatch_id = 0 +; GFX11-NEXT: enable_sgpr_dispatch_id = 1 ; GFX11-NEXT: 
enable_sgpr_flat_scratch_init = 0 ; GFX11-NEXT: enable_sgpr_private_segment_size = 0 ; GFX11-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -4338,9 +4338,9 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: workitem_private_segment_byte_size = 0 ; GFX11-NEXT: workgroup_group_segment_byte_size = 0 ; GFX11-NEXT: gds_segment_byte_size = 0 -; GFX11-NEXT: kernarg_segment_byte_size = 12 +; GFX11-NEXT: kernarg_segment_byte_size = 28 ; GFX11-NEXT: workgroup_fbarrier_count = 0 -; GFX11-NEXT: wavefront_sgpr_count = 4 +; GFX11-NEXT: wavefront_sgpr_count = 5 ; GFX11-NEXT: workitem_vgpr_count = 2 ; GFX11-NEXT: reserved_vgpr_first = 0 ; GFX11-NEXT: reserved_vgpr_count = 0 @@ -4357,16 +4357,16 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: .end_amd_kernel_code_t ; GFX11-NEXT: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_eq_u32 s2, 1 -; GFX11-NEXT: s_cselect_b32 s3, 2.0, 1.0 -; GFX11-NEXT: s_cmp_eq_u32 s2, 2 -; GFX11-NEXT: s_cselect_b32 s3, 0x40400000, s3 -; GFX11-NEXT: s_cmp_eq_u32 s2, 3 -; GFX11-NEXT: s_cselect_b32 s2, 4.0, s3 +; GFX11-NEXT: s_cmp_eq_u32 s4, 1 +; GFX11-NEXT: s_cselect_b32 s2, 2.0, 1.0 +; GFX11-NEXT: s_cmp_eq_u32 s4, 2 +; GFX11-NEXT: s_cselect_b32 s2, 0x40400000, s2 +; GFX11-NEXT: s_cmp_eq_u32 s4, 3 +; GFX11-NEXT: s_cselect_b32 s2, 4.0, s2 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -4401,21 +4401,21 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: enable_mem_ordered = 0 ; GPRIDX-NEXT: enable_fwd_progress = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GPRIDX-NEXT: 
user_sgpr_count = 6 +; GPRIDX-NEXT: user_sgpr_count = 10 ; GPRIDX-NEXT: enable_trap_handler = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1 -; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 0 -; GPRIDX-NEXT: enable_sgpr_workgroup_id_z = 0 +; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1 +; GPRIDX-NEXT: enable_sgpr_workgroup_id_z = 1 ; GPRIDX-NEXT: enable_sgpr_workgroup_info = 0 -; GPRIDX-NEXT: enable_vgpr_workitem_id = 0 +; GPRIDX-NEXT: enable_vgpr_workitem_id = 2 ; GPRIDX-NEXT: enable_exception_msb = 0 ; GPRIDX-NEXT: granulated_lds_size = 0 ; GPRIDX-NEXT: enable_exception = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_buffer = 1 -; GPRIDX-NEXT: enable_sgpr_dispatch_ptr = 0 +; GPRIDX-NEXT: enable_sgpr_dispatch_ptr = 1 ; GPRIDX-NEXT: enable_sgpr_queue_ptr = 0 ; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GPRIDX-NEXT: enable_sgpr_dispatch_id = 0 +; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1 ; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -4431,9 +4431,9 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: workitem_private_segment_byte_size = 0 ; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0 ; GPRIDX-NEXT: gds_segment_byte_size = 0 -; GPRIDX-NEXT: kernarg_segment_byte_size = 12 +; GPRIDX-NEXT: kernarg_segment_byte_size = 28 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 11 +; GPRIDX-NEXT: wavefront_sgpr_count = 13 ; GPRIDX-NEXT: workitem_vgpr_count = 3 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 @@ -4449,17 +4449,17 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: runtime_loader_kernel_symbol = 0 ; GPRIDX-NEXT: .end_amd_kernel_code_t ; GPRIDX-NEXT: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_load_dword s6, s[4:5], 0x8 -; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GPRIDX-NEXT: s_load_dword 
s8, s[6:7], 0x8 +; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GPRIDX-NEXT: s_mov_b32 s2, 0 ; GPRIDX-NEXT: s_mov_b32 s3, 0x40080000 ; GPRIDX-NEXT: v_mov_b32_e32 v2, 0 ; GPRIDX-NEXT: s_waitcnt lgkmcnt(0) -; GPRIDX-NEXT: s_cmp_eq_u32 s6, 1 +; GPRIDX-NEXT: s_cmp_eq_u32 s8, 1 ; GPRIDX-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 -; GPRIDX-NEXT: s_cmp_eq_u32 s6, 2 +; GPRIDX-NEXT: s_cmp_eq_u32 s8, 2 ; GPRIDX-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GPRIDX-NEXT: s_cmp_eq_u32 s6, 3 +; GPRIDX-NEXT: s_cmp_eq_u32 s8, 3 ; GPRIDX-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] ; GPRIDX-NEXT: v_mov_b32_e32 v0, s2 ; GPRIDX-NEXT: v_mov_b32_e32 v1, s3 @@ -4477,7 +4477,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: kernel_code_entry_byte_offset = 256 ; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0 ; MOVREL-NEXT: granulated_workitem_vgpr_count = 0 -; MOVREL-NEXT: granulated_wavefront_sgpr_count = 0 +; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1 ; MOVREL-NEXT: priority = 0 ; MOVREL-NEXT: float_mode = 240 ; MOVREL-NEXT: priv = 0 @@ -4488,21 +4488,21 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: enable_mem_ordered = 0 ; MOVREL-NEXT: enable_fwd_progress = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; MOVREL-NEXT: user_sgpr_count = 6 +; MOVREL-NEXT: user_sgpr_count = 10 ; MOVREL-NEXT: enable_trap_handler = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1 -; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 0 -; MOVREL-NEXT: enable_sgpr_workgroup_id_z = 0 +; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1 +; MOVREL-NEXT: enable_sgpr_workgroup_id_z = 1 ; MOVREL-NEXT: enable_sgpr_workgroup_info = 0 -; MOVREL-NEXT: enable_vgpr_workitem_id = 0 +; MOVREL-NEXT: enable_vgpr_workitem_id = 2 ; MOVREL-NEXT: enable_exception_msb = 0 ; MOVREL-NEXT: granulated_lds_size = 0 ; MOVREL-NEXT: enable_exception = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_buffer = 1 -; MOVREL-NEXT: 
enable_sgpr_dispatch_ptr = 0 +; MOVREL-NEXT: enable_sgpr_dispatch_ptr = 1 ; MOVREL-NEXT: enable_sgpr_queue_ptr = 0 ; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; MOVREL-NEXT: enable_sgpr_dispatch_id = 0 +; MOVREL-NEXT: enable_sgpr_dispatch_id = 1 ; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_size = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -4518,9 +4518,9 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: workitem_private_segment_byte_size = 0 ; MOVREL-NEXT: workgroup_group_segment_byte_size = 0 ; MOVREL-NEXT: gds_segment_byte_size = 0 -; MOVREL-NEXT: kernarg_segment_byte_size = 12 +; MOVREL-NEXT: kernarg_segment_byte_size = 28 ; MOVREL-NEXT: workgroup_fbarrier_count = 0 -; MOVREL-NEXT: wavefront_sgpr_count = 7 +; MOVREL-NEXT: wavefront_sgpr_count = 9 ; MOVREL-NEXT: workitem_vgpr_count = 4 ; MOVREL-NEXT: reserved_vgpr_first = 0 ; MOVREL-NEXT: reserved_vgpr_count = 0 @@ -4536,16 +4536,16 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: runtime_loader_kernel_symbol = 0 ; MOVREL-NEXT: .end_amd_kernel_code_t ; MOVREL-NEXT: ; %bb.0: ; %entry -; MOVREL-NEXT: s_load_dword s6, s[4:5], 0x8 -; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; MOVREL-NEXT: s_load_dword s8, s[6:7], 0x8 +; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; MOVREL-NEXT: s_mov_b32 s2, 0 ; MOVREL-NEXT: s_mov_b32 s3, 0x40080000 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; MOVREL-NEXT: s_cmp_eq_u32 s6, 1 +; MOVREL-NEXT: s_cmp_eq_u32 s8, 1 ; MOVREL-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 -; MOVREL-NEXT: s_cmp_eq_u32 s6, 2 +; MOVREL-NEXT: s_cmp_eq_u32 s8, 2 ; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; MOVREL-NEXT: s_cmp_eq_u32 s6, 3 +; MOVREL-NEXT: s_cmp_eq_u32 s8, 3 ; MOVREL-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] ; MOVREL-NEXT: v_mov_b32_e32 v0, s2 ; MOVREL-NEXT: v_mov_b32_e32 v3, s1 @@ -4565,7 +4565,7 @@ define amdgpu_kernel void 
@dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: kernel_code_entry_byte_offset = 256 ; GFX10-NEXT: kernel_code_prefetch_byte_size = 0 ; GFX10-NEXT: granulated_workitem_vgpr_count = 0 -; GFX10-NEXT: granulated_wavefront_sgpr_count = 0 +; GFX10-NEXT: granulated_wavefront_sgpr_count = 1 ; GFX10-NEXT: priority = 0 ; GFX10-NEXT: float_mode = 240 ; GFX10-NEXT: priv = 0 @@ -4576,21 +4576,21 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_mem_ordered = 1 ; GFX10-NEXT: enable_fwd_progress = 0 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX10-NEXT: user_sgpr_count = 6 +; GFX10-NEXT: user_sgpr_count = 10 ; GFX10-NEXT: enable_trap_handler = 0 ; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1 -; GFX10-NEXT: enable_sgpr_workgroup_id_y = 0 -; GFX10-NEXT: enable_sgpr_workgroup_id_z = 0 +; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1 +; GFX10-NEXT: enable_sgpr_workgroup_id_z = 1 ; GFX10-NEXT: enable_sgpr_workgroup_info = 0 -; GFX10-NEXT: enable_vgpr_workitem_id = 0 +; GFX10-NEXT: enable_vgpr_workitem_id = 2 ; GFX10-NEXT: enable_exception_msb = 0 ; GFX10-NEXT: granulated_lds_size = 0 ; GFX10-NEXT: enable_exception = 0 ; GFX10-NEXT: enable_sgpr_private_segment_buffer = 1 -; GFX10-NEXT: enable_sgpr_dispatch_ptr = 0 +; GFX10-NEXT: enable_sgpr_dispatch_ptr = 1 ; GFX10-NEXT: enable_sgpr_queue_ptr = 0 ; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GFX10-NEXT: enable_sgpr_dispatch_id = 0 +; GFX10-NEXT: enable_sgpr_dispatch_id = 1 ; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0 ; GFX10-NEXT: enable_sgpr_private_segment_size = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -4606,9 +4606,9 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: workitem_private_segment_byte_size = 0 ; GFX10-NEXT: workgroup_group_segment_byte_size = 0 ; GFX10-NEXT: gds_segment_byte_size = 0 -; GFX10-NEXT: kernarg_segment_byte_size = 12 +; GFX10-NEXT: 
kernarg_segment_byte_size = 28 ; GFX10-NEXT: workgroup_fbarrier_count = 0 -; GFX10-NEXT: wavefront_sgpr_count = 7 +; GFX10-NEXT: wavefront_sgpr_count = 9 ; GFX10-NEXT: workitem_vgpr_count = 3 ; GFX10-NEXT: reserved_vgpr_first = 0 ; GFX10-NEXT: reserved_vgpr_count = 0 @@ -4625,17 +4625,17 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: .end_amd_kernel_code_t ; GFX10-NEXT: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s6, s[4:5], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s8, s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: s_mov_b32 s3, 0x40080000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_eq_u32 s6, 1 +; GFX10-NEXT: s_cmp_eq_u32 s8, 1 ; GFX10-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 -; GFX10-NEXT: s_cmp_eq_u32 s6, 2 +; GFX10-NEXT: s_cmp_eq_u32 s8, 2 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GFX10-NEXT: s_cmp_eq_u32 s6, 3 +; GFX10-NEXT: s_cmp_eq_u32 s8, 3 ; GFX10-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 @@ -4664,21 +4664,21 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: enable_mem_ordered = 1 ; GFX11-NEXT: enable_fwd_progress = 0 ; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX11-NEXT: user_sgpr_count = 15 +; GFX11-NEXT: user_sgpr_count = 13 ; GFX11-NEXT: enable_trap_handler = 0 ; GFX11-NEXT: enable_sgpr_workgroup_id_x = 1 -; GFX11-NEXT: enable_sgpr_workgroup_id_y = 0 -; GFX11-NEXT: enable_sgpr_workgroup_id_z = 0 +; GFX11-NEXT: enable_sgpr_workgroup_id_y = 1 +; GFX11-NEXT: enable_sgpr_workgroup_id_z = 1 ; GFX11-NEXT: enable_sgpr_workgroup_info = 0 -; GFX11-NEXT: enable_vgpr_workitem_id = 0 +; GFX11-NEXT: enable_vgpr_workitem_id = 2 ; GFX11-NEXT: enable_exception_msb = 0 ; GFX11-NEXT: granulated_lds_size 
= 0 ; GFX11-NEXT: enable_exception = 0 ; GFX11-NEXT: enable_sgpr_private_segment_buffer = 0 -; GFX11-NEXT: enable_sgpr_dispatch_ptr = 0 +; GFX11-NEXT: enable_sgpr_dispatch_ptr = 1 ; GFX11-NEXT: enable_sgpr_queue_ptr = 0 ; GFX11-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GFX11-NEXT: enable_sgpr_dispatch_id = 0 +; GFX11-NEXT: enable_sgpr_dispatch_id = 1 ; GFX11-NEXT: enable_sgpr_flat_scratch_init = 0 ; GFX11-NEXT: enable_sgpr_private_segment_size = 0 ; GFX11-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -4694,7 +4694,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: workitem_private_segment_byte_size = 0 ; GFX11-NEXT: workgroup_group_segment_byte_size = 0 ; GFX11-NEXT: gds_segment_byte_size = 0 -; GFX11-NEXT: kernarg_segment_byte_size = 12 +; GFX11-NEXT: kernarg_segment_byte_size = 28 ; GFX11-NEXT: workgroup_fbarrier_count = 0 ; GFX11-NEXT: wavefront_sgpr_count = 7 ; GFX11-NEXT: workitem_vgpr_count = 3 @@ -4713,8 +4713,8 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: .end_amd_kernel_code_t ; GFX11-NEXT: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s6, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s2, 0 ; GFX11-NEXT: s_mov_b32 s3, 0x40080000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll index 7cd99fcfd5e74..9b9249b62b0bc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll @@ -1,6 +1,8 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,RW-FLAT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch < %s | FileCheck -check-prefixes=GCN,RO-FLAT %s -; RUN: 
llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GCN,RO-FLAT %s +; RUN: opt -passes=amdgpu-attributor -mcpu=gfx900 < %s | llc -mcpu=gfx900 | FileCheck -check-prefixes=GCN,RW-FLAT %s +; RUN: opt -passes=amdgpu-attributor -mcpu=gfx900 -mattr=+architected-flat-scratch < %s | llc | FileCheck -check-prefixes=GCN,RO-FLAT %s +; RUN: opt -passes=amdgpu-attributor -mcpu=gfx940 < %s | llc | FileCheck -check-prefixes=GCN,RO-FLAT %s + +target triple = "amdgcn-amd-amdhsa" ; Make sure flat_scratch_init is set diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index 63e7339d829e1..a5e4151bf3695 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -8,9 +8,9 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX9-LABEL: store_load_sindex_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 @@ -26,11 +26,11 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX10-LABEL: store_load_sindex_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s2, s2, s5 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: s_add_u32 s6, s6, s11 +; GFX10-NEXT: s_addc_u32 s7, s7, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s1, s0, 15 @@ -46,7 +46,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX940-LABEL: store_load_sindex_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v0, 15 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s1, s0, 2 @@ -62,7 +62,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX11-LABEL: store_load_sindex_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 @@ -78,7 +78,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX12-LABEL: store_load_sindex_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 15 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b32 s1, s0, 2 @@ -105,10 +105,10 @@ bb: define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX9-LABEL: store_load_vindex_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9-NEXT: v_add_u32_e32 v1, 0, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -121,10 +121,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX10-LABEL: store_load_vindex_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s0, s0, s3 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: s_add_u32 s6, s6, s11 +; GFX10-NEXT: 
s_addc_u32 s7, s7, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 @@ -139,6 +139,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX940-LABEL: store_load_vindex_kernel: ; GFX940: ; %bb.0: ; %bb +; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 15 @@ -152,10 +153,12 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX11-LABEL: store_load_vindex_kernel: ; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-NEXT: scratch_store_b32 v0, v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 0, v1 @@ -165,9 +168,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX12-LABEL: store_load_vindex_kernel: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 ; GFX12-NEXT: scratch_store_b32 v0, v2, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -321,9 +325,9 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) { define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX9-LABEL: 
store_load_sindex_small_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s1 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -341,11 +345,11 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX10-LABEL: store_load_sindex_small_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s2, s2, s5 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: s_add_u32 s6, s6, s11 +; GFX10-NEXT: s_addc_u32 s7, s7, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX10-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 @@ -363,7 +367,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX940-LABEL: store_load_sindex_small_offset_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, 15 @@ -381,7 +385,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX11-LABEL: store_load_sindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX11-NEXT: scratch_load_b32 v2, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: 
s_and_b32 s1, s0, 15 @@ -398,7 +402,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX12-LABEL: store_load_sindex_small_offset_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 15 @@ -430,8 +434,8 @@ bb: define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX9-LABEL: store_load_vindex_small_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_load_dword v1, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -449,10 +453,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; ; GFX10-LABEL: store_load_vindex_small_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s0, s0, s3 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: s_add_u32 s6, s6, s11 +; GFX10-NEXT: s_addc_u32 s7, s7, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 @@ -471,6 +475,7 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: scratch_load_dword v1, off, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 15 @@ -484,11 +489,14 @@ define amdgpu_kernel void 
@store_load_vindex_small_offset_kernel() { ; ; GFX11-LABEL: store_load_vindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: scratch_load_b32 v3, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 +; GFX11-NEXT: v_mov_b32_e32 v2, 15 +; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:256 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x100, v1 @@ -498,12 +506,13 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; ; GFX12-LABEL: store_load_vindex_small_offset_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: v_mov_b32_e32 v2, 15 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 ; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:256 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:380 scope:SCOPE_SYS @@ -630,9 +639,9 @@ bb: define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX9-LABEL: store_load_sindex_large_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 
flat_scratch_lo, s6, s11 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s1 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -650,11 +659,11 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX10-LABEL: store_load_sindex_large_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s2, s2, s5 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: s_add_u32 s6, s6, s11 +; GFX10-NEXT: s_addc_u32 s7, s7, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 @@ -672,7 +681,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX940-LABEL: store_load_sindex_large_offset_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, 15 @@ -691,7 +700,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX11-LABEL: store_load_sindex_large_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s0, 15 @@ -709,7 +718,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX12-LABEL: store_load_sindex_large_offset_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, 
s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 15 @@ -741,8 +750,8 @@ bb: define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX9-LABEL: store_load_vindex_large_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_load_dword v1, off, s0 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -760,10 +769,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX10-LABEL: store_load_vindex_large_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s0, s0, s3 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: s_add_u32 s6, s6, s11 +; GFX10-NEXT: s_addc_u32 s7, s7, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 @@ -782,6 +791,7 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 15 @@ -796,12 +806,15 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX11-LABEL: store_load_vindex_large_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, 
v0 ; GFX11-NEXT: s_movk_i32 s0, 0x4004 ; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 +; GFX11-NEXT: v_mov_b32_e32 v2, 15 +; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-NEXT: scratch_store_b32 v0, v2, s0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x4004, v1 @@ -811,12 +824,13 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX12-LABEL: store_load_vindex_large_offset_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: v_mov_b32_e32 v2, 15 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 ; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16384 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:16508 scope:SCOPE_SYS @@ -945,8 +959,8 @@ bb: define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX9-LABEL: store_load_large_imm_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4 @@ -962,10 +976,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; ; GFX10-LABEL: 
store_load_large_imm_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s0, s0, s3 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: s_add_u32 s6, s6, s11 +; GFX10-NEXT: s_addc_u32 s7, s7, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 ; GFX10-NEXT: v_mov_b32_e32 v0, 13 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: s_movk_i32 s0, 0x3e80 @@ -1114,9 +1128,9 @@ bb: define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX9-LABEL: store_load_vidx_sidx_offset: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_lshl_u32 v0, s0, v0, 2 @@ -1129,11 +1143,11 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX10-LABEL: store_load_vidx_sidx_offset: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s2, s2, s5 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: s_add_u32 s6, s6, s11 +; GFX10-NEXT: s_addc_u32 s7, s7, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_lshl_u32 v0, s0, v0, 2 @@ -1146,7 +1160,8 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX940-LABEL: store_load_vidx_sidx_offset: ; GFX940: ; %bb.0: ; %bb 
-; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_mov_b32_e32 v1, 15 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_add_lshl_u32 v0, s0, v0, 2 @@ -1159,11 +1174,11 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX11-LABEL: store_load_vidx_sidx_offset: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_lshl_u32 v0, s0, v0, 2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v0, 0, v0 ; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:1024 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1173,9 +1188,10 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX12-LABEL: store_load_vidx_sidx_offset: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 15 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_lshl_u32 v0, s0, v0, 2 ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1024 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll index 632dbd45279fb..2d3b6ee3e9823 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll @@ -7,14 +7,16 @@ declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x h ; bf16 atomics use v2i16 argument since there is no bf16 data type 
in the llvm. declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) +declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32, i32, i1) +declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 ; GFX940-NEXT: s_endpgm @@ -25,7 +27,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -41,7 +43,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -82,10 +84,10 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { define amdgpu_kernel void 
@flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) { ; GFX940-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 ; GFX940-NEXT: s_endpgm @@ -107,10 +109,10 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) { ; GFX940-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 ; GFX940-NEXT: s_endpgm @@ -132,12 +134,12 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) { define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) { ; GFX940-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-NEXT: global_atomic_pk_add_bf16 v1, v0, s[2:3] +; GFX940-NEXT: global_atomic_pk_add_bf16 v1, v0, s[0:1] ; GFX940-NEXT: s_endpgm %ret = call <2 x i16> 
@llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) ret void @@ -154,6 +156,56 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> ret <2 x i16> %ret } +define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { +; GFX940-LABEL: local_atomic_fadd_v2f16_noret: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NEXT: ds_pk_add_f16 v0, v1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_endpgm + %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) + ret void +} + +define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> %data) { +; GFX940-LABEL: local_atomic_fadd_v2f16_rtn: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) + ret <2 x half> %ret +} + +define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { +; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NEXT: ds_pk_add_f16 v0, v1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_endpgm + %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) + ret void +} + +define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> %data) { +; GFX940-LABEL: local_atomic_fadd_v2bf16_rtn: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX940-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) + ret <2 x i16> %ret +} + define <2 x half> @local_atomic_fadd_ret_v2f16_offset(ptr addrspace(3) %ptr, <2 x half> %val) { ; GFX940-LABEL: local_atomic_fadd_ret_v2f16_offset: ; GFX940: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index 66b22bedaf072..453b229bf62bd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -20,26 +20,27 @@ declare double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data) declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data) declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) +declare double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) nocapture, double, i32, i32, i1) define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; 
%main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -71,12 +72,12 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -86,12 +87,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX940-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: 
s_load_dword s10, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -107,22 +108,22 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -154,12 +155,12 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, 
double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -169,12 +170,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -190,22 +191,22 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -237,12 +238,12 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 
s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -252,12 +253,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> % ; ; GFX940-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -273,22 +274,22 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_ptr_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -320,12 +321,12 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -335,12 +336,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add ; ; GFX940-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; 
GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -356,22 +357,22 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -403,12 +404,12 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double 
%data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -418,12 +419,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX940-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -439,22 +440,22 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 
0x34 -; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -486,12 +487,12 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 
s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -501,12 +502,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp ; ; GFX940-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -522,22 +523,22 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 
-; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -569,12 +570,12 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -584,12 +585,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> % ; ; GFX940-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c +; 
GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -605,22 +606,22 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_ptr_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -652,12 +653,12 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 
%vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -667,12 +668,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add ; ; GFX940-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -688,22 +689,22 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -735,12 +736,12 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], 
s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -750,12 +751,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX940-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -771,22 +772,22 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 
-; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -818,12 +819,12 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -833,12 +834,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; GFX940-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c +; 
GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -854,22 +855,22 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -901,12 +902,12 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; 
GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -916,12 +917,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> % ; ; GFX940-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -937,22 +938,22 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; 
GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_ptr_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -984,12 +985,12 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 
s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -999,12 +1000,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add ; ; GFX940-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -1020,7 +1021,7 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fadd_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] @@ -1029,7 +1030,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, d ; ; GFX940-LABEL: global_atomic_fadd_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1043,7 +1044,7 @@ main_body: define amdgpu_kernel void 
@global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fmin_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] @@ -1052,7 +1053,7 @@ define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, d ; ; GFX940-LABEL: global_atomic_fmin_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1066,7 +1067,7 @@ main_body: define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fmax_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] @@ -1075,7 +1076,7 @@ define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, d ; ; GFX940-LABEL: global_atomic_fmax_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1089,16 +1090,16 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[2:3], exec -; GFX90A-NEXT: s_mov_b32 s4, s3 -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, 
v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], exec +; GFX90A-NEXT: s_mov_b32 s0, s5 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s0, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB39_3 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[4:5] ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 @@ -1125,22 +1126,22 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: s_mov_b32 s4, s3 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: s_mov_b64 s[0:1], exec +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB39_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: .LBB39_2: @@ -1153,21 +1154,21 @@ main_body: define 
amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[2:3], exec -; GFX90A-NEXT: s_mov_b32 s4, s3 -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: s_mov_b64 s[0:1], exec +; GFX90A-NEXT: s_mov_b32 s4, s1 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB40_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: .LBB40_2: @@ -1175,22 +1176,22 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: s_mov_b32 s4, s3 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: s_mov_b64 s[0:1], exec +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB40_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX940-NEXT: 
s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: .LBB40_2: @@ -1203,16 +1204,16 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[2:3], exec -; GFX90A-NEXT: s_mov_b32 s4, s3 -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], exec +; GFX90A-NEXT: s_mov_b32 s0, s5 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s0, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB41_3 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[4:5] ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 @@ -1239,22 +1240,22 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: s_mov_b32 s4, s3 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: s_mov_b64 s[0:1], exec +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 
v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB41_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: .LBB41_2: @@ -1267,21 +1268,21 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[2:3], exec -; GFX90A-NEXT: s_mov_b32 s4, s3 -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: s_mov_b64 s[0:1], exec +; GFX90A-NEXT: s_mov_b32 s4, s1 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB42_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: .LBB42_2: @@ -1289,22 +1290,22 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: s_mov_b32 s4, s3 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: s_mov_b64 s[0:1], exec +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB42_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: .LBB42_2: @@ -1479,16 +1480,16 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrspace(1) %ptr) { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[2:3], exec -; GFX90A-NEXT: s_mov_b32 s4, s3 -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], exec +; GFX90A-NEXT: s_mov_b32 s0, s5 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s0, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_and_saveexec_b64 
s[0:1], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB49_3 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[4:5] ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 @@ -1513,22 +1514,22 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: s_mov_b32 s4, s3 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: s_mov_b64 s[0:1], exec +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB49_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: .LBB49_2: @@ -1541,7 +1542,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: 
v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] @@ -1565,7 +1566,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -1582,7 +1583,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -1594,7 +1595,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -1611,7 +1612,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_system: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] @@ -1636,7 +1637,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_system: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -1760,7 +1761,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -1769,7 +1770,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -1802,7 +1803,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] @@ -1824,7 +1825,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -1841,7 +1842,7 @@ main_body: define 
amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) { ; GFX90A-LABEL: flat_atomic_fmin_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -1850,7 +1851,7 @@ define amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) { ; ; GFX940-LABEL: flat_atomic_fmin_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -1883,7 +1884,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) { ; GFX90A-LABEL: flat_atomic_fmax_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -1892,7 +1893,7 @@ define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) { ; ; GFX940-LABEL: flat_atomic_fmax_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -1922,47 +1923,119 @@ main_body: ret double %ret } +define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, double %data) { +; GFX90A-LABEL: local_atomic_fadd_f64_noret: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[0:1], exec +; GFX90A-NEXT: s_mov_b32 s4, s1 +; 
GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB63_2 +; GFX90A-NEXT: ; %bb.1: +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c +; GFX90A-NEXT: s_load_dword s6, s[2:3], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mul_f64 v[0:1], s[4:5], v[0:1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NEXT: ds_add_f64 v2, v[0:1] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: .LBB63_2: +; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: local_atomic_fadd_f64_noret: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[0:1], exec +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB63_2 +; GFX940-NEXT: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c +; GFX940-NEXT: s_load_dword s6, s[2:3], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mul_f64 v[0:1], s[4:5], v[0:1] +; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: ds_add_f64 v2, v[0:1] +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: .LBB63_2: +; GFX940-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) + ret void +} + +define double @local_atomic_fadd_f64_rtn(ptr addrspace(3) %ptr, double %data) { +; GFX90A-LABEL: local_atomic_fadd_f64_rtn: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] +; 
GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: local_atomic_fadd_f64_rtn: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +main_body: + %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) + ret double %ret +} + define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr) #1 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[2:3], exec -; GFX90A-NEXT: s_mov_b32 s4, s3 -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: s_mov_b64 s[0:1], exec +; GFX90A-NEXT: s_mov_b32 s4, s1 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB63_2 +; GFX90A-NEXT: s_cbranch_execz .LBB65_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s0 +; GFX90A-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB63_2: +; GFX90A-NEXT: .LBB65_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: s_mov_b32 s4, s3 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: s_mov_b64 s[0:1], exec +; 
GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB63_2 +; GFX940-NEXT: s_cbranch_execz .LBB65_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s0 +; GFX940-NEXT: v_mov_b32_e32 v2, s2 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB63_2: +; GFX940-NEXT: .LBB65_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst @@ -1972,91 +2045,91 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3) %ptr) #0 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[2:3], exec -; GFX90A-NEXT: s_mov_b32 s4, s3 -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: s_mov_b64 s[0:1], exec +; GFX90A-NEXT: s_mov_b32 s4, s1 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB64_2 +; GFX90A-NEXT: s_cbranch_execz .LBB66_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX90A-NEXT: v_mov_b32_e32 v2, s0 +; GFX90A-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB64_2: +; GFX90A-NEXT: .LBB66_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: s_mov_b32 s4, s3 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: s_mov_b64 s[0:1], exec +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB64_2 +; GFX940-NEXT: s_cbranch_execz .LBB66_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s0 +; GFX940-NEXT: v_mov_b32_e32 v2, s2 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB64_2: +; GFX940-NEXT: .LBB66_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst ret void } -define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrspace(3) %ptr) #2 { +define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrspace(3) %ptr) #4 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[2:3], exec -; GFX90A-NEXT: s_mov_b32 s4, s3 -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: s_mov_b64 s[0:1], exec +; GFX90A-NEXT: s_mov_b32 s4, s1 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX90A-NEXT: 
v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB65_2 +; GFX90A-NEXT: s_cbranch_execz .LBB67_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s0 +; GFX90A-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB65_2: +; GFX90A-NEXT: .LBB67_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: s_mov_b32 s4, s3 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: s_mov_b64 s[0:1], exec +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB65_2 +; GFX940-NEXT: s_cbranch_execz .LBB67_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s0 +; GFX940-NEXT: v_mov_b32_e32 v2, s2 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB65_2: +; GFX940-NEXT: .LBB67_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 
4.0 seq_cst @@ -2085,6 +2158,54 @@ main_body: ret double %ret } +define double @local_atomic_fadd_f64_rtn_ieee_unsafe(ptr addrspace(3) %ptr, double %data) #2 { +; GFX90A-LABEL: local_atomic_fadd_f64_rtn_ieee_unsafe: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: local_atomic_fadd_f64_rtn_ieee_unsafe: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +main_body: + %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) + ret double %ret +} + +define double @local_atomic_fadd_f64_rtn_ieee_safe(ptr addrspace(3) %ptr, double %data) #3 { +; GFX90A-LABEL: local_atomic_fadd_f64_rtn_ieee_safe: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: local_atomic_fadd_f64_rtn_ieee_safe: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +main_body: + %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) + ret double %ret +} + attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" 
"amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" } -attributes #2 = { "denormal-fp-math"="preserve-sign,preserve-sign" } +attributes #2 = { "denormal-fp-math"="ieee,ieee" "amdgpu-unsafe-fp-atomics"="true" } +attributes #3 = { "denormal-fp-math"="ieee,ieee" } +attributes #4 = { "denormal-fp-math"="preserve-sign,preserve-sign" } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll index 05cdb54f5dd74..e051cc28469fa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll @@ -5,8 +5,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[6:7], 0x0 ; CI-NEXT: s_load_dword s0, s[0:1], 0x2 @@ -37,8 +37,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; VI-LABEL: frem_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x8 @@ -67,8 +67,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: fast_frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; 
CI-NEXT: s_load_dword s2, s[6:7], 0x0 ; CI-NEXT: s_load_dword s0, s[0:1], 0x2 @@ -87,8 +87,8 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: fast_frem_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x8 @@ -113,8 +113,8 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 { ; CI-LABEL: unsafe_frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[6:7], 0x0 ; CI-NEXT: s_load_dword s0, s[0:1], 0x2 @@ -133,8 +133,8 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: unsafe_frem_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x8 @@ -159,8 +159,8 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt 
lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[6:7], 0x0 ; CI-NEXT: s_load_dword s0, s[0:1], 0x4 @@ -188,8 +188,8 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; VI-LABEL: frem_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x10 @@ -225,8 +225,8 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: fast_frem_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[6:7], 0x0 ; CI-NEXT: s_load_dword s0, s[0:1], 0x4 @@ -243,8 +243,8 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: fast_frem_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x10 @@ -269,8 +269,8 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 { ; CI-LABEL: unsafe_frem_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; 
CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[6:7], 0x0 ; CI-NEXT: s_load_dword s0, s[0:1], 0x4 @@ -287,8 +287,8 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: unsafe_frem_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x10 @@ -313,8 +313,8 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -341,8 +341,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; VI-LABEL: frem_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -376,8 +376,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: fast_frem_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -401,8 +401,8 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: fast_frem_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -433,8 +433,8 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; CI-LABEL: unsafe_frem_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -458,8 +458,8 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: unsafe_frem_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -491,8 +491,8 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[6:7], 0x0 ; CI-NEXT: s_load_dword s0, s[0:1], 0x4 @@ -545,8 +545,8 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x10 @@ -588,8 +588,8 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 @@ -682,8 +682,8 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x20 @@ -747,8 +747,8 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_v2f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 @@ -792,8 +792,8 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x20 @@ -845,8 +845,8 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_v4f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 @@ -922,8 +922,8 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x40 @@ -1007,8 +1007,8 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_v2f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 @@ -1050,8 +1050,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x40 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll index 388ef2497e435..fe2e7afb7048e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll @@ -952,9 +952,9 @@ define void @void_func_sret_struct_i8_i32(ptr addrspace(5) sret({ i8, i32 }) %ar ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (volatile load (s8) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p1) :: (volatile load (s32) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CHECK-NEXT: %5:_(p5) = nuw nusw G_PTR_ADD [[COPY]], [[C]](s32) + ; CHECK-NEXT: %12:_(p5) = nuw nusw G_PTR_ADD [[COPY]], [[C]](s32) ; CHECK-NEXT: G_STORE [[LOAD]](s8), [[COPY]](p5) :: (store (s8) into %ir.arg0, addrspace 5) - ; CHECK-NEXT: G_STORE [[LOAD1]](s32), %5(p5) :: (store (s32) into %ir.gep1, addrspace 5) + ; CHECK-NEXT: G_STORE [[LOAD1]](s32), %12(p5) :: (store (s32) into %ir.gep1, addrspace 5) ; CHECK-NEXT: SI_RETURN %val0 = load volatile i8, ptr addrspace(1) undef %val1 = load volatile i32, ptr addrspace(1) undef diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll index 8859ac69923a9..9443b39dcdc03 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll @@ -10,8 +10,8 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) { ; GFX8V4-LABEL: addrspacecast: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x40 +; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x40 ; GFX8V4-NEXT: v_mov_b32_e32 v2, 1 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_mov_b32 s4, s0 @@ -35,8 +35,8 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX8V5-LABEL: addrspacecast: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xc8 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0xc8 ; GFX8V5-NEXT: v_mov_b32_e32 v2, 1 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_mov_b32 s4, s0 @@ -59,7 +59,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V4-LABEL: addrspacecast: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX9V4-NEXT: v_mov_b32_e32 v2, 1 @@ -83,7 +83,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V5-LABEL: addrspacecast: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9V5-NEXT: s_mov_b64 s[2:3], 
src_private_base ; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX9V5-NEXT: v_mov_b32_e32 v2, 1 @@ -114,9 +114,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; GFX8V4-LABEL: llvm_amdgcn_is_shared: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x40 +; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x40 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0 @@ -127,9 +127,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; ; GFX8V5-LABEL: llvm_amdgcn_is_shared: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V5-NEXT: s_load_dword s0, s[4:5], 0xcc +; GFX8V5-NEXT: s_load_dword s0, s[6:7], 0xcc ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V5-NEXT: s_cselect_b32 s0, 1, 0 @@ -140,7 +140,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; ; GFX9V4-LABEL: llvm_amdgcn_is_shared: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9V4-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V4-NEXT: s_cmp_eq_u32 s1, s3 @@ -152,7 +152,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; ; GFX9V5-LABEL: llvm_amdgcn_is_shared: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9V5-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_cmp_eq_u32 s1, s3 @@ -170,9 +170,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { define amdgpu_kernel void 
@llvm_amdgcn_is_private(ptr %ptr) { ; GFX8V4-LABEL: llvm_amdgcn_is_private: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x44 +; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x44 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0 @@ -183,9 +183,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; ; GFX8V5-LABEL: llvm_amdgcn_is_private: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V5-NEXT: s_load_dword s0, s[4:5], 0xc8 +; GFX8V5-NEXT: s_load_dword s0, s[6:7], 0xc8 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V5-NEXT: s_cselect_b32 s0, 1, 0 @@ -196,7 +196,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; ; GFX9V4-LABEL: llvm_amdgcn_is_private: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V4-NEXT: s_cmp_eq_u32 s1, s3 @@ -208,7 +208,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; ; GFX9V5-LABEL: llvm_amdgcn_is_private: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_cmp_eq_u32 s1, s3 @@ -226,12 +226,12 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { define amdgpu_kernel void @llvm_trap() { ; GFX8V4-LABEL: llvm_trap: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8V4-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX8V4-NEXT: s_trap 2 ; ; GFX8V5-LABEL: llvm_trap: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0xc8 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0xc8 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_trap 2 ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm-mismatched-size.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm-mismatched-size.ll index 136c51d775b43..696cbdb75f1ed 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm-mismatched-size.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm-mismatched-size.ll @@ -19,6 +19,9 @@ define amdgpu_kernel void @return_type_is_too_big_vector() { ; CHECK-LABEL: name: return_type_is_too_big_vector ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1 (%ir-block.0): ; CHECK-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10 /* regdef */, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll index a1c99f5cf6029..db944b98a3013 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -7,9 +7,9 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr addrspace(1) %ptr, i32 %val, i32 %idx) #0 { ; GCN-LABEL: v_insert_v64i32_varidx: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[20:23], s[4:5], 0x0 -; GCN-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x10 -; GCN-NEXT: s_add_u32 s0, s0, s7 +; GCN-NEXT: s_load_dwordx4 s[20:23], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[24:25], s[6:7], 0x10 +; GCN-NEXT: s_add_u32 s0, s0, s13 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll index 3abc21f812e14..5185f6c4ada5b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @v_insert_v64i32_37(ptr addrspace(1) %ptr.in, ptr addrspace(1) %ptr.out) #0 { ; GCN-LABEL: v_insert_v64i32_37: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: v_lshlrev_b32_e32 v64, 8, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v64, s[0:1] @@ -53,7 +53,7 @@ define amdgpu_kernel void @v_insert_v64i32_37(ptr addrspace(1) %ptr.in, ptr addr ; ; GFX10-LABEL: v_insert_v64i32_37: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v64, 8, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0xf @@ -101,7 +101,9 @@ define amdgpu_kernel void @v_insert_v64i32_37(ptr addrspace(1) %ptr.in, ptr addr ; ; GFX11-LABEL: v_insert_v64i32_37: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0xf diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel-system-sgprs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel-system-sgprs.ll index e9292f4e34dcd..e67ada74c23e6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel-system-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel-system-sgprs.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-ir-lower-kernel-arguments=0 -stop-after=irtranslator -global-isel %s -o - | FileCheck -check-prefix=HSA %s +; 
RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=amdgpu-attributor < %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-ir-lower-kernel-arguments=0 -stop-after=irtranslator -o - | FileCheck -check-prefix=HSA %s ; HSA-LABEL: name: default_kernel ; HSA: liveins: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll index f2fe815a71202..652d22ac1224f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll @@ -5,9 +5,9 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounwind { ; HSA-VI-LABEL: name: i8_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -20,9 +20,9 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw ; ; LEGACY-MESA-VI-LABEL: name: i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -40,9 +40,9 @@ define 
amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroext %in) nounwind { ; HSA-VI-LABEL: name: i8_zext_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -55,9 +55,9 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe ; ; LEGACY-MESA-VI-LABEL: name: i8_zext_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -75,9 +75,9 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signext %in) nounwind { ; HSA-VI-LABEL: name: i8_sext_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: 
[[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -90,9 +90,9 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe ; ; LEGACY-MESA-VI-LABEL: name: i8_sext_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -110,9 +110,9 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nounwind { ; HSA-VI-LABEL: name: i16_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -125,9 +125,9 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou ; ; LEGACY-MESA-VI-LABEL: name: i16_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY 
$sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -145,9 +145,9 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zeroext %in) nounwind { ; HSA-VI-LABEL: name: i16_zext_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -160,9 +160,9 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer ; ; LEGACY-MESA-VI-LABEL: name: i16_zext_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -180,9 +180,9 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture 
%out, i16 signext %in) nounwind { ; HSA-VI-LABEL: name: i16_sext_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -195,9 +195,9 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig ; ; LEGACY-MESA-VI-LABEL: name: i16_sext_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -215,9 +215,9 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nounwind { ; HSA-VI-LABEL: name: i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 
16, addrspace 4) @@ -229,9 +229,9 @@ define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nou ; ; LEGACY-MESA-VI-LABEL: name: i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -248,9 +248,9 @@ entry: define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) nounwind { ; HSA-VI-LABEL: name: f32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -262,9 +262,9 @@ define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) n ; ; LEGACY-MESA-VI-LABEL: name: f32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = 
G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -281,9 +281,9 @@ entry: define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) { ; HSA-VI-LABEL: name: v2i8_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -295,9 +295,9 @@ define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v2i8_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -314,9 +314,9 @@ entry: define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) { ; HSA-VI-LABEL: name: v2i16_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: 
(dereferenceable invariant load (p1), align 16, addrspace 4) @@ -328,9 +328,9 @@ define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v2i16_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -347,9 +347,9 @@ entry: define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> %in) nounwind { ; HSA-VI-LABEL: name: v2i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -361,9 +361,9 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> ; ; LEGACY-MESA-VI-LABEL: name: v2i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], 
[[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -380,9 +380,9 @@ entry: define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float> %in) nounwind { ; HSA-VI-LABEL: name: v2f32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -394,9 +394,9 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float ; ; LEGACY-MESA-VI-LABEL: name: v2f32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -413,9 +413,9 @@ entry: define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %in) nounwind { ; HSA-VI-LABEL: name: v3i8_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: 
[[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -427,9 +427,9 @@ define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %i ; ; LEGACY-MESA-VI-LABEL: name: v3i8_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -446,9 +446,9 @@ entry: define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind { ; HSA-VI-LABEL: name: v3i16_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -460,9 +460,9 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> ; ; LEGACY-MESA-VI-LABEL: name: v3i16_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; 
LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -479,9 +479,9 @@ entry: define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind { ; HSA-VI-LABEL: name: v3i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -493,9 +493,9 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> ; ; LEGACY-MESA-VI-LABEL: name: v3i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -512,9 +512,9 @@ entry: define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind { ; HSA-VI-LABEL: name: v3f32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: 
[[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -526,9 +526,9 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float ; ; LEGACY-MESA-VI-LABEL: name: v3f32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -545,9 +545,9 @@ entry: define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) { ; HSA-VI-LABEL: name: v4i8_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -559,9 +559,9 @@ define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v4i8_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY 
$sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -578,9 +578,9 @@ entry: define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; HSA-VI-LABEL: name: v4i16_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -592,9 +592,9 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v4i16_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -611,9 +611,9 @@ entry: define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> %in) nounwind { ; HSA-VI-LABEL: name: v4i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: 
[[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -625,9 +625,9 @@ define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> ; ; LEGACY-MESA-VI-LABEL: name: v4i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -644,9 +644,9 @@ entry: define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float> %in) nounwind { ; HSA-VI-LABEL: name: v4f32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -658,9 +658,9 @@ define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float ; ; LEGACY-MESA-VI-LABEL: name: v4f32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; 
LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -677,9 +677,9 @@ entry: define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; HSA-VI-LABEL: name: v8i8_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -691,9 +691,9 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v8i8_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -710,9 +710,9 @@ entry: define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; HSA-VI-LABEL: name: v8i16_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: 
liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -724,9 +724,9 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v8i16_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -743,9 +743,9 @@ entry: define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind { ; HSA-VI-LABEL: name: v8i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -757,9 +757,9 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; ; LEGACY-MESA-VI-LABEL: name: v8i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: 
$sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -776,9 +776,9 @@ entry: define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float> %in) nounwind { ; HSA-VI-LABEL: name: v8f32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -790,9 +790,9 @@ define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float ; ; LEGACY-MESA-VI-LABEL: name: v8f32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -809,9 +809,9 @@ entry: define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; HSA-VI-LABEL: name: 
v16i8_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -823,9 +823,9 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v16i8_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -842,9 +842,9 @@ entry: define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; HSA-VI-LABEL: name: v16i16_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -856,9 +856,9 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v16i16_arg ; 
LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -875,9 +875,9 @@ entry: define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32> %in) nounwind { ; HSA-VI-LABEL: name: v16i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -889,9 +889,9 @@ define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32 ; ; LEGACY-MESA-VI-LABEL: name: v16i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -908,9 +908,9 @@ entry: define amdgpu_kernel void 
@v16f32_arg(ptr addrspace(1) nocapture %out, <16 x float> %in) nounwind { ; HSA-VI-LABEL: name: v16f32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -922,9 +922,9 @@ define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x flo ; ; LEGACY-MESA-VI-LABEL: name: v16f32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -941,9 +941,9 @@ entry: define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwind { ; HSA-VI-LABEL: name: kernel_arg_i64 ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -955,9 +955,9 @@ define 
amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin ; ; LEGACY-MESA-VI-LABEL: name: kernel_arg_i64 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -973,9 +973,9 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { ; HSA-VI-LABEL: name: f64_kernel_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -987,9 +987,9 @@ define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { ; ; LEGACY-MESA-VI-LABEL: name: f64_kernel_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; 
LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1006,9 +1006,9 @@ entry: define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { ; HSA-VI-LABEL: name: i1_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1020,9 +1020,9 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: i1_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1038,9 +1038,9 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwind { ; HSA-VI-LABEL: name: i1_arg_zext_i32 ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = 
G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1053,9 +1053,9 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; LEGACY-MESA-VI-LABEL: name: i1_arg_zext_i32 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1073,9 +1073,9 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwind { ; HSA-VI-LABEL: name: i1_arg_zext_i64 ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1088,9 +1088,9 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; ; LEGACY-MESA-VI-LABEL: name: i1_arg_zext_i64 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; 
LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1108,9 +1108,9 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwind { ; HSA-VI-LABEL: name: i1_arg_sext_i32 ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1123,9 +1123,9 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; LEGACY-MESA-VI-LABEL: name: i1_arg_sext_i32 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1143,9 +1143,9 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel 
void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwind { ; HSA-VI-LABEL: name: i1_arg_sext_i64 ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1158,9 +1158,9 @@ define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; ; LEGACY-MESA-VI-LABEL: name: i1_arg_sext_i64 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1180,9 +1180,9 @@ define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @empty_struct_arg({} %arg0, i32 %arg1) nounwind { ; HSA-VI-LABEL: name: empty_struct_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: 
(dereferenceable invariant load (s32), align 16, addrspace 4) @@ -1192,9 +1192,9 @@ define amdgpu_kernel void @empty_struct_arg({} %arg0, i32 %arg1) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: empty_struct_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4) @@ -1208,9 +1208,9 @@ define amdgpu_kernel void @empty_struct_arg({} %arg0, i32 %arg1) nounwind { define amdgpu_kernel void @empty_array_arg([0 x i8] %arg0, i32 %arg1) nounwind { ; HSA-VI-LABEL: name: empty_array_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 16, addrspace 4) @@ -1220,9 +1220,9 @@ define amdgpu_kernel void @empty_array_arg([0 x i8] %arg0, i32 %arg1) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: empty_array_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = 
G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4) @@ -1244,9 +1244,9 @@ define amdgpu_kernel void @empty_array_arg([0 x i8] %arg0, i32 %arg1) nounwind { define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8 %pad, {i32, i64} %arg1) { ; HSA-VI-LABEL: name: struct_argument_alignment ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 16, addrspace 4) @@ -1272,9 +1272,9 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8 %pad, ; ; LEGACY-MESA-VI-LABEL: name: struct_argument_alignment ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4) @@ -1312,9 +1312,9 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8 %pad, define amdgpu_kernel void @pointer_in_struct_argument({ptr addrspace(3), ptr addrspace(1)} %arg0, i8 %pad, {ptr addrspace(3), ptr addrspace(1234)} %arg1) { ; HSA-VI-LABEL: name: 
pointer_in_struct_argument ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 16, addrspace 4) @@ -1340,9 +1340,9 @@ define amdgpu_kernel void @pointer_in_struct_argument({ptr addrspace(3), ptr add ; ; LEGACY-MESA-VI-LABEL: name: pointer_in_struct_argument ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4) @@ -1382,9 +1382,9 @@ define amdgpu_kernel void @pointer_in_struct_argument({ptr addrspace(3), ptr add define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { ; HSA-VI-LABEL: name: packed_struct_argument_alignment ; HSA-VI: bb.1 (%ir-block.1): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable 
invariant load (s32), align 16, addrspace 4) @@ -1406,9 +1406,9 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, ; ; LEGACY-MESA-VI-LABEL: name: packed_struct_argument_alignment ; LEGACY-MESA-VI: bb.1 (%ir-block.1): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4) @@ -1441,16 +1441,16 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, define amdgpu_kernel void @unused_i32_arg(ptr addrspace(1) nocapture %out, i32 %unused, i32 %in) nounwind { ; HSA-VI-LABEL: name: unused_i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: S_ENDPGM 0 ; ; LEGACY-MESA-VI-LABEL: name: unused_i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: S_ENDPGM 0 entry: ret void @@ -1460,9 +1460,9 @@ entry: define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i8) %in.byref) { ; HSA-VI-LABEL: name: byref_constant_i8_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; 
HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1475,9 +1475,9 @@ define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1496,9 +1496,9 @@ define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i16) align 2 %in.byref) { ; HSA-VI-LABEL: name: byref_constant_i16_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1511,9 +1511,9 @@ define amdgpu_kernel void @byref_constant_i16_arg(ptr 
addrspace(1) nocapture %ou ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_i16_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1532,9 +1532,9 @@ define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %ou define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align 4 %in.byref, i32 %after.offset) { ; HSA-VI-LABEL: name: byref_constant_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1550,9 +1550,9 @@ define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %ou ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; 
LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1574,9 +1574,9 @@ define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %ou define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(<4 x i32>) align(16) %in.byref, i32 %after.offset) { ; HSA-VI-LABEL: name: byref_constant_v4i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1592,9 +1592,9 @@ define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture % ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_v4i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1616,9 +1616,9 @@ define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture % define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) 
%in.byref, i32 %after.offset) { ; HSA-VI-LABEL: name: byref_align_constant_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1634,9 +1634,9 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu ; ; LEGACY-MESA-VI-LABEL: name: byref_align_constant_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1658,9 +1658,9 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) { ; HSA-VI-LABEL: name: byref_natural_align_constant_v16i32_arg ; HSA-VI: bb.1 (%ir-block.1): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: 
[[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1676,9 +1676,9 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace ; ; LEGACY-MESA-VI-LABEL: name: byref_natural_align_constant_v16i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.1): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1701,9 +1701,9 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(1) byref(i32) align(4) %in.byref) { ; HSA-VI-LABEL: name: byref_global_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1716,9 +1716,9 @@ define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, ; ; LEGACY-MESA-VI-LABEL: name: byref_global_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: 
$sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1736,9 +1736,9 @@ define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, ptr byref(i32) align(4) %in.byref) { ; HSA-VI-LABEL: name: byref_flat_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1751,9 +1751,9 @@ define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, p ; ; LEGACY-MESA-VI-LABEL: name: byref_flat_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1771,9 +1771,9 @@ define amdgpu_kernel 
void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, p define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(6) byref(i32) align(4) %in.byref) { ; HSA-VI-LABEL: name: byref_constant_32bit_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1786,9 +1786,9 @@ define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocaptu ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_32bit_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1806,9 +1806,9 @@ define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocaptu define amdgpu_kernel void @byref_unknown_as_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(999) byref(i32) align(4) %in.byref) { ; HSA-VI-LABEL: name: byref_unknown_as_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: 
[[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1821,9 +1821,9 @@ define amdgpu_kernel void @byref_unknown_as_i32_arg(ptr addrspace(1) nocapture % ; ; LEGACY-MESA-VI-LABEL: name: byref_unknown_as_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1842,9 +1842,9 @@ define amdgpu_kernel void @byref_unknown_as_i32_arg(ptr addrspace(1) nocapture % define amdgpu_kernel void @byref_local_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(3) byref(i32) align(4) %in.byref) { ; HSA-VI-LABEL: name: byref_local_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1857,9 +1857,9 @@ define amdgpu_kernel void @byref_local_i32_arg(ptr addrspace(1) nocapture %out, ; ; LEGACY-MESA-VI-LABEL: name: byref_local_i32_arg ; LEGACY-MESA-VI: bb.1 
(%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1877,9 +1877,9 @@ define amdgpu_kernel void @byref_local_i32_arg(ptr addrspace(1) nocapture %out, define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(4) %in0.byref, ptr addrspace(4) byref(i32) align(4) %in1.byref, i32 %after.offset) { ; HSA-VI-LABEL: name: multi_byref_constant_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1899,9 +1899,9 @@ define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocaptu ; ; LEGACY-MESA-VI-LABEL: name: multi_byref_constant_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = 
G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1929,9 +1929,9 @@ define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocaptu define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref(i32) align(4) %in.byref) { ; HSA-VI-LABEL: name: byref_constant_i32_arg_offset0 ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF @@ -1941,9 +1941,9 @@ define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_i32_arg_offset0 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF @@ -1958,9 +1958,9 @@ define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref define amdgpu_kernel void @p3i8_arg(ptr addrspace(3) %arg) nounwind { ; HSA-VI-LABEL: name: p3i8_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; 
HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p3), align 16, addrspace 4) @@ -1970,9 +1970,9 @@ define amdgpu_kernel void @p3i8_arg(ptr addrspace(3) %arg) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: p3i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p3), addrspace 4) @@ -1986,9 +1986,9 @@ define amdgpu_kernel void @p3i8_arg(ptr addrspace(3) %arg) nounwind { define amdgpu_kernel void @p1i8_arg(ptr addrspace(1) %arg) nounwind { ; HSA-VI-LABEL: name: p1i8_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 9 ; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 0 ; HSA-VI-NEXT: G_STORE [[C]](s8), [[C1]](p3) :: (store (s8) into `ptr addrspace(3) null`, addrspace 3) @@ -1996,9 +1996,9 @@ define amdgpu_kernel void @p1i8_arg(ptr addrspace(1) %arg) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: p1i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: 
[[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 9 ; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 0 ; LEGACY-MESA-VI-NEXT: G_STORE [[C]](s8), [[C1]](p3) :: (store (s8) into `ptr addrspace(3) null`, addrspace 3) @@ -2010,9 +2010,9 @@ define amdgpu_kernel void @p1i8_arg(ptr addrspace(1) %arg) nounwind { define amdgpu_kernel void @v2p1i8_arg(<2 x ptr addrspace(1)> %arg) nounwind { ; HSA-VI-LABEL: name: v2p1i8_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x p1>), addrspace 4) @@ -2022,9 +2022,9 @@ define amdgpu_kernel void @v2p1i8_arg(<2 x ptr addrspace(1)> %arg) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: v2p1i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x p1>), align 4, addrspace 4) @@ -2038,9 +2038,9 @@ define amdgpu_kernel void @v2p1i8_arg(<2 x ptr addrspace(1)> %arg) nounwind { define amdgpu_kernel void @v2p3i8_arg(<2 x ptr addrspace(3)> %arg) nounwind { ; HSA-VI-LABEL: name: v2p3i8_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: 
$sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x p3>), align 16, addrspace 4) @@ -2050,9 +2050,9 @@ define amdgpu_kernel void @v2p3i8_arg(<2 x ptr addrspace(3)> %arg) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: v2p3i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x p3>), align 4, addrspace 4) @@ -2066,9 +2066,9 @@ define amdgpu_kernel void @v2p3i8_arg(<2 x ptr addrspace(3)> %arg) nounwind { define amdgpu_kernel void @v2p1i8_in_struct_arg({ <2 x ptr addrspace(1)>, <2 x ptr addrspace(3)> } %arg) nounwind { ; HSA-VI-LABEL: name: v2p1i8_in_struct_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 + ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x s64>), addrspace 4) @@ -2084,9 +2084,9 @@ define 
amdgpu_kernel void @v2p1i8_in_struct_arg({ <2 x ptr addrspace(1)>, <2 x p ; ; LEGACY-MESA-VI-LABEL: name: v2p1i8_in_struct_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fence.ll index eebbe20abd043..6b0e9618754df 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fence.ll @@ -4,6 +4,9 @@ define amdgpu_kernel void @system_one_as_acquire() { ; CHECK-LABEL: name: system_one_as_acquire ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 4, 2 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("one-as") acquire @@ -13,6 +16,9 @@ define amdgpu_kernel void @system_one_as_acquire() { define amdgpu_kernel void @system_one_as_release() { ; CHECK-LABEL: name: system_one_as_release ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 2 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("one-as") release @@ -22,6 +28,9 @@ define amdgpu_kernel void @system_one_as_release() { define amdgpu_kernel void @system_one_as_acq_rel() { ; CHECK-LABEL: name: system_one_as_acq_rel ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: 
$sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 2 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("one-as") acq_rel @@ -31,6 +40,9 @@ define amdgpu_kernel void @system_one_as_acq_rel() { define amdgpu_kernel void @system_one_as_seq_cst() { ; CHECK-LABEL: name: system_one_as_seq_cst ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 2 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("one-as") seq_cst @@ -40,6 +52,9 @@ define amdgpu_kernel void @system_one_as_seq_cst() { define amdgpu_kernel void @singlethread_one_as_acquire() { ; CHECK-LABEL: name: singlethread_one_as_acquire ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 4, 3 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread-one-as") acquire @@ -49,6 +64,9 @@ define amdgpu_kernel void @singlethread_one_as_acquire() { define amdgpu_kernel void @singlethread_one_as_release() { ; CHECK-LABEL: name: singlethread_one_as_release ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 3 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread-one-as") release @@ -58,6 +76,9 @@ define amdgpu_kernel void @singlethread_one_as_release() { define amdgpu_kernel void @singlethread_one_as_acq_rel() { ; CHECK-LABEL: name: singlethread_one_as_acq_rel ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 3 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread-one-as") acq_rel @@ -67,6 +88,9 @@ define amdgpu_kernel void @singlethread_one_as_acq_rel() { define amdgpu_kernel void 
@singlethread_one_as_seq_cst() { ; CHECK-LABEL: name: singlethread_one_as_seq_cst ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 3 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread-one-as") seq_cst @@ -76,6 +100,9 @@ define amdgpu_kernel void @singlethread_one_as_seq_cst() { define amdgpu_kernel void @agent_one_as_acquire() { ; CHECK-LABEL: name: agent_one_as_acquire ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 4, 4 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent-one-as") acquire @@ -85,6 +112,9 @@ define amdgpu_kernel void @agent_one_as_acquire() { define amdgpu_kernel void @agent_one_as_release() { ; CHECK-LABEL: name: agent_one_as_release ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 4 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent-one-as") release @@ -94,6 +124,9 @@ define amdgpu_kernel void @agent_one_as_release() { define amdgpu_kernel void @agent_one_as_acq_rel() { ; CHECK-LABEL: name: agent_one_as_acq_rel ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 4 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent-one-as") acq_rel @@ -103,6 +136,9 @@ define amdgpu_kernel void @agent_one_as_acq_rel() { define amdgpu_kernel void @agent_one_as_seq_cst() { ; CHECK-LABEL: name: agent_one_as_seq_cst ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 4 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent-one-as") seq_cst @@ -112,6 +148,9 @@ define 
amdgpu_kernel void @agent_one_as_seq_cst() { define amdgpu_kernel void @workgroup_one_as_acquire() { ; CHECK-LABEL: name: workgroup_one_as_acquire ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 4, 5 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup-one-as") acquire @@ -121,6 +160,9 @@ define amdgpu_kernel void @workgroup_one_as_acquire() { define amdgpu_kernel void @workgroup_one_as_release() { ; CHECK-LABEL: name: workgroup_one_as_release ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 5 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup-one-as") release @@ -130,6 +172,9 @@ define amdgpu_kernel void @workgroup_one_as_release() { define amdgpu_kernel void @workgroup_one_as_acq_rel() { ; CHECK-LABEL: name: workgroup_one_as_acq_rel ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 5 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup-one-as") acq_rel @@ -139,6 +184,9 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() { define amdgpu_kernel void @workgroup_one_as_seq_cst() { ; CHECK-LABEL: name: workgroup_one_as_seq_cst ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 5 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup-one-as") seq_cst @@ -148,6 +196,9 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() { define amdgpu_kernel void @wavefront_one_as_acquire() { ; CHECK-LABEL: name: wavefront_one_as_acquire ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; 
CHECK-NEXT: G_FENCE 4, 6 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront-one-as") acquire @@ -157,6 +208,9 @@ define amdgpu_kernel void @wavefront_one_as_acquire() { define amdgpu_kernel void @wavefront_one_as_release() { ; CHECK-LABEL: name: wavefront_one_as_release ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 6 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront-one-as") release @@ -166,6 +220,9 @@ define amdgpu_kernel void @wavefront_one_as_release() { define amdgpu_kernel void @wavefront_one_as_acq_rel() { ; CHECK-LABEL: name: wavefront_one_as_acq_rel ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 6 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront-one-as") acq_rel @@ -175,6 +232,9 @@ define amdgpu_kernel void @wavefront_one_as_acq_rel() { define amdgpu_kernel void @wavefront_one_as_seq_cst() { ; CHECK-LABEL: name: wavefront_one_as_seq_cst ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 6 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront-one-as") seq_cst @@ -184,6 +244,9 @@ define amdgpu_kernel void @wavefront_one_as_seq_cst() { define amdgpu_kernel void @system_acquire() { ; CHECK-LABEL: name: system_acquire ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: S_ENDPGM 0 entry: ret void @@ -192,6 +255,9 @@ entry: define amdgpu_kernel void @system_release() { ; CHECK-LABEL: name: system_release ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 1 ; 
CHECK-NEXT: S_ENDPGM 0 fence release @@ -201,6 +267,9 @@ define amdgpu_kernel void @system_release() { define amdgpu_kernel void @system_acq_rel() { ; CHECK-LABEL: name: system_acq_rel ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 1 ; CHECK-NEXT: S_ENDPGM 0 fence acq_rel @@ -210,6 +279,9 @@ define amdgpu_kernel void @system_acq_rel() { define amdgpu_kernel void @system_seq_cst() { ; CHECK-LABEL: name: system_seq_cst ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 1 ; CHECK-NEXT: S_ENDPGM 0 fence seq_cst @@ -219,6 +291,9 @@ define amdgpu_kernel void @system_seq_cst() { define amdgpu_kernel void @singlethread_acquire() { ; CHECK-LABEL: name: singlethread_acquire ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 4, 0 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread") acquire @@ -228,6 +303,9 @@ define amdgpu_kernel void @singlethread_acquire() { define amdgpu_kernel void @singlethread_release() { ; CHECK-LABEL: name: singlethread_release ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 0 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread") release @@ -237,6 +315,9 @@ define amdgpu_kernel void @singlethread_release() { define amdgpu_kernel void @singlethread_acq_rel() { ; CHECK-LABEL: name: singlethread_acq_rel ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 0 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread") acq_rel @@ -246,6 +327,9 @@ 
define amdgpu_kernel void @singlethread_acq_rel() { define amdgpu_kernel void @singlethread_seq_cst() { ; CHECK-LABEL: name: singlethread_seq_cst ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 0 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread") seq_cst @@ -255,6 +339,9 @@ define amdgpu_kernel void @singlethread_seq_cst() { define amdgpu_kernel void @agent_acquire() { ; CHECK-LABEL: name: agent_acquire ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 4, 7 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent") acquire @@ -264,6 +351,9 @@ define amdgpu_kernel void @agent_acquire() { define amdgpu_kernel void @agent_release() { ; CHECK-LABEL: name: agent_release ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 7 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent") release @@ -273,6 +363,9 @@ define amdgpu_kernel void @agent_release() { define amdgpu_kernel void @agent_acq_rel() { ; CHECK-LABEL: name: agent_acq_rel ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 7 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent") acq_rel @@ -282,6 +375,9 @@ define amdgpu_kernel void @agent_acq_rel() { define amdgpu_kernel void @agent_seq_cst() { ; CHECK-LABEL: name: agent_seq_cst ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 7 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent") seq_cst @@ -291,6 +387,9 @@ define amdgpu_kernel void @agent_seq_cst() { define amdgpu_kernel 
void @workgroup_acquire() { ; CHECK-LABEL: name: workgroup_acquire ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 4, 8 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup") acquire @@ -300,6 +399,9 @@ define amdgpu_kernel void @workgroup_acquire() { define amdgpu_kernel void @workgroup_release() { ; CHECK-LABEL: name: workgroup_release ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 8 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup") release @@ -309,6 +411,9 @@ define amdgpu_kernel void @workgroup_release() { define amdgpu_kernel void @workgroup_acq_rel() { ; CHECK-LABEL: name: workgroup_acq_rel ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 8 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup") acq_rel @@ -318,6 +423,9 @@ define amdgpu_kernel void @workgroup_acq_rel() { define amdgpu_kernel void @workgroup_seq_cst() { ; CHECK-LABEL: name: workgroup_seq_cst ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 8 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup") seq_cst @@ -327,6 +435,9 @@ define amdgpu_kernel void @workgroup_seq_cst() { define amdgpu_kernel void @wavefront_acquire() { ; CHECK-LABEL: name: wavefront_acquire ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 4, 9 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront") acquire @@ -336,6 +447,9 @@ define amdgpu_kernel void @wavefront_acquire() { define amdgpu_kernel void 
@wavefront_release() { ; CHECK-LABEL: name: wavefront_release ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 9 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront") release @@ -345,6 +459,9 @@ define amdgpu_kernel void @wavefront_release() { define amdgpu_kernel void @wavefront_acq_rel() { ; CHECK-LABEL: name: wavefront_acq_rel ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 9 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront") acq_rel @@ -354,6 +471,9 @@ define amdgpu_kernel void @wavefront_acq_rel() { define amdgpu_kernel void @wavefront_seq_cst() { ; CHECK-LABEL: name: wavefront_seq_cst ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 9 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront") seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll index ecad793ad5898..8813462652efd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll @@ -810,14 +810,14 @@ define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 { ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 - ; GCN-NEXT: 
[[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr15 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr14 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr13 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr12 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr2 @@ -932,14 +932,14 @@ define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 { ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX35]], [[C3]](s32) ; GCN-NEXT: G_STORE [[C1]](s64), [[PTR_ADD2]](p5) :: (store (s64) into %ir.alloca1 + 8, addrspace 5) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @void_fastcc_multi_byval - ; GCN-NEXT: [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[COPY41:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) - ; GCN-NEXT: [[COPY42:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY43:%[0-9]+]]:_(s64) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY44:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY45:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[COPY46:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GCN-NEXT: [[COPY47:%[0-9]+]]:_(s32) = COPY [[COPY1]] + ; GCN-NEXT: [[COPY42:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) + ; GCN-NEXT: 
[[COPY43:%[0-9]+]]:_(s64) = COPY [[COPY5]](s64) + ; GCN-NEXT: [[COPY44:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) + ; GCN-NEXT: [[COPY45:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; GCN-NEXT: [[COPY46:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY47:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[COPY48:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[FRAME_INDEX36:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 @@ -978,14 +978,14 @@ define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64 ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr15 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr14 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr13 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr12 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY 
$vgpr0 ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr2 @@ -1096,14 +1096,14 @@ define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64 ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX34]], [[C3]](s32) ; GCN-NEXT: G_STORE [[C]](s32), [[PTR_ADD1]](p5) :: (store (s32) into %ir.alloca + 8, addrspace 5) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @void_fastcc_byval_and_stack_passed - ; GCN-NEXT: [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[COPY41:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) - ; GCN-NEXT: [[COPY42:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY43:%[0-9]+]]:_(s64) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY44:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY45:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[COPY46:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GCN-NEXT: [[COPY47:%[0-9]+]]:_(s32) = COPY [[COPY1]] + ; GCN-NEXT: [[COPY42:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) + ; GCN-NEXT: [[COPY43:%[0-9]+]]:_(s64) = COPY [[COPY5]](s64) + ; GCN-NEXT: [[COPY44:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) + ; GCN-NEXT: [[COPY45:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; GCN-NEXT: [[COPY46:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY47:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[COPY48:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[FRAME_INDEX35:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2 ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 @@ -1170,26 +1170,26 @@ define hidden fastcc i64 @sibling_call_i64_fastcc_i64(i64 %a) #1 { ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: 
[[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr15 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr14 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr13 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr12 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY9]](s32), [[COPY10]](s32) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @i64_fastcc_i64 - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]](s64) + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: 
[[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64) ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1211,50 +1211,23 @@ entry: ret i64 %ret } -declare hidden fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %arg0) +declare hidden fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %arg0) #1 define hidden fastcc ptr addrspace(1) @sibling_call_p1i8_fastcc_p1i8(ptr addrspace(1) %a) #1 { ; GCN-LABEL: name: sibling_call_p1i8_fastcc_p1i8 ; GCN: bb.1.entry: - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $vgpr0, $vgpr1 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY9]](s32), [[COPY10]](s32) + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @p1i8_fastcc_p1i8 - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = 
COPY [[COPY3]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](p1) ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32) ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY12]](p4) - ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[COPY13]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY14]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY15]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY16]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY17]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[COPY18]](s32) - ; GCN-NEXT: $vgpr31 = COPY [[COPY19]](s32) - ; GCN-NEXT: SI_TCRETURN [[GV]](p0), @p1i8_fastcc_p1i8, 0, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>) + ; GCN-NEXT: SI_TCRETURN [[GV]](p0), @p1i8_fastcc_p1i8, 0, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3 entry: %ret = tail call fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %a) ret ptr addrspace(1) %ret @@ -1268,25 +1241,25 @@ define hidden fastcc i16 @sibling_call_i16_fastcc_i16(i16 %a) #1 { ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 
- ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr15 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr14 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr13 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr12 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @i16_fastcc_i16 - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]](s64) + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: 
[[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16) ; GCN-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) @@ -1315,25 +1288,25 @@ define hidden fastcc half @sibling_call_f16_fastcc_f16(half %a) #1 { ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr15 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr14 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr13 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr12 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @f16_fastcc_f16 - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]] + 
; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]](s64) + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16) ; GCN-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) @@ -1362,28 +1335,28 @@ define hidden fastcc <3 x i16> @sibling_call_v3i16_fastcc_v3i16(<3 x i16> %a) #1 ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr15 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr14 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr13 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr12 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY9]](<2 x s16>), [[COPY10]](<2 x s16>) ; GCN-NEXT: 
[[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s16>) ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @v3i16_fastcc_v3i16 - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]](s64) + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s16>) ; GCN-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF @@ -1416,26 +1389,26 @@ define hidden fastcc <4 x i16> @sibling_call_v4i16_fastcc_v4i16(<4 x i16> %a) #1 ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; 
GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr15 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr14 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr13 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr12 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY9]](<2 x s16>), [[COPY10]](<2 x s16>) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @v4i16_fastcc_v4i16 - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]](s64) + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) 
= G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s16>) ; GCN-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) @@ -1465,14 +1438,14 @@ define hidden fastcc <2 x i64> @sibling_call_v2i64_fastcc_v2i64(<2 x i64> %a) #1 ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr15 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr14 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr13 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr12 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr2 @@ -1481,14 +1454,14 @@ define hidden fastcc <2 x i64> @sibling_call_v2i64_fastcc_v2i64(<2 x i64> %a) #1 ; GCN-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY11]](s32), [[COPY12]](s32) ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @v2i64_fastcc_v2i64 - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(p4) 
= COPY [[COPY7]](p4) - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s64) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY1]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s64) = COPY [[COPY5]](s64) + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s64>) ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1513,7 +1486,7 @@ entry: } attributes #0 = { nounwind } -attributes #1 = { nounwind noinline } +attributes #1 = { nounwind noinline "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll index 2f718814ef77b..c3938e673a6da 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @use_lds_globals(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { ; CHECK-LABEL: use_lds_globals: ; 
CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 4 ; CHECK-NEXT: s_mov_b32 m0, -1 ; CHECK-NEXT: ds_read_b32 v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll index 7587aa0cad2d4..b8b7256011df8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=amdgcn -mcpu=tahiti -global-isel -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX6 %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti -global-isel -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: not llc -mtriple=amdgcn -mcpu=tahiti -global-isel < %s 2>&1 | FileCheck %s ; RUN: not llc -mtriple=amdgcn -mcpu=tonga -global-isel < %s 2>&1 | FileCheck %s @@ -11,25 +11,25 @@ define amdgpu_kernel void @load_zeroinit_lds_global(ptr addrspace(1) %out, i1 %p) { ; GCN-LABEL: name: load_zeroinit_lds_global ; GCN: bb.1 (%ir-block.0): - ; GCN: liveins: $sgpr0_sgpr1 - ; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 40 + ; GCN: liveins: $sgpr2_sgpr3 + ; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr2_sgpr3 + ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 40 ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @lds - ; GFX6: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[S_MOV_B32_1]], [[S_MOV_B32_]], implicit-def dead $scc - ; GFX6: 
[[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 9, 0 - ; GFX8: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_U32_]] + ; GFX8: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[S_MOV_B32_1]], [[S_MOV_B32_]], implicit-def dead $scc + ; GFX8: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 9, 0 + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_U32_]] ; GCN: $m0 = S_MOV_B32 -1 - ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX6: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY1]], 0, 0, implicit $m0, implicit $exec - ; GFX8: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY1]], 40, 0, implicit $m0, implicit $exec - ; GFX6: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 - ; GFX6: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1 - ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_LOAD_DWORDX2_IMM]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: BUFFER_STORE_DWORD_OFFSET [[DS_READ_B32_]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec - ; GFX8: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] - ; GFX8: FLAT_STORE_DWORD [[COPY2]], [[DS_READ_B32_]], 0, 0, implicit $exec, implicit $flat_scr + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX8: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY1]], 0, 0, implicit $m0, implicit $exec + ; GFX9: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY1]], 40, 0, implicit $m0, implicit $exec + ; GFX8: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 + ; GFX8: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, 
[[S_MOV_B32_3]], %subreg.sub1 + ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_LOAD_DWORDX2_IMM]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX8: BUFFER_STORE_DWORD_OFFSET [[DS_READ_B32_]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec + ; GFX9: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] + ; GFX9: FLAT_STORE_DWORD [[COPY2]], [[DS_READ_B32_]], 0, 0, implicit $exec, implicit $flat_scr ; GCN: S_ENDPGM 0 %gep = getelementptr [256 x i32], ptr addrspace(3) @lds, i32 0, i32 10 %ld = load i32, ptr addrspace(3) %gep diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll index 1a49a38158122..90f34acaa17aa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f32_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -27,7 +27,7 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: test_div_scale_f32_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -48,7 +48,7 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs ; ; GFX10-LABEL: test_div_scale_f32_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -62,7 
+62,9 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_div_scale_f32_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -91,7 +93,7 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f32_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -110,7 +112,7 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: test_div_scale_f32_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -131,7 +133,7 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs ; ; GFX10-LABEL: test_div_scale_f32_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -145,7 +147,9 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_div_scale_f32_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -174,7 +178,7 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f64_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -184,7 +188,7 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:8 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -193,12 +197,11 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: test_div_scale_f64_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0 @@ -207,7 +210,9 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f64 
v[0:1], s[2:3], v[2:3], v[2:3], v[0:1] +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[2:3], v[2:3], v[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -215,31 +220,35 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs ; ; GFX10-LABEL: test_div_scale_f64_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[2:3], v[2:3], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[2:3], v[2:3], v[0:1] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f64_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] glc dlc +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[2:3], v2, s[2:3] offset:8 glc dlc +; GFX11-NEXT: global_load_b64 v[2:3], v2, s[0:1] offset:8 
glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[2:3], v[2:3], v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -260,7 +269,7 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f64_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -270,7 +279,7 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:8 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[2:3], v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -279,12 +288,11 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: test_div_scale_f64_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0 @@ -293,7 +301,9 @@ define amdgpu_kernel void 
@test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -301,31 +311,35 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs ; ; GFX10-LABEL: test_div_scale_f64_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f64_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] glc dlc +; GFX11-NEXT: 
global_load_b64 v[0:1], v2, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[2:3], v2, s[2:3] offset:8 glc dlc +; GFX11-NEXT: global_load_b64 v[2:3], v2, s[0:1] offset:8 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], v[2:3], v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -346,8 +360,8 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], float %a) { ; GFX7-LABEL: test_div_scale_f32_scalar_num_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dword s8, s[0:1], 0x15 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dword s8, s[2:3], 0x15 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 @@ -364,8 +378,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f32_scalar_num_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x54 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x54 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -382,9 +396,10 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(ptr addrspace(1) %out ; ; GFX10-LABEL: test_div_scale_f32_scalar_num_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x54 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: 
s_load_dword s0, s[0:1], 0x54 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] @@ -395,9 +410,12 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(ptr addrspace(1) %out ; ; GFX11-LABEL: test_div_scale_f32_scalar_num_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x54 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x54 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -420,8 +438,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(ptr addrspace(1) %out, ptr addrspace(1) %in, float %a) { ; GFX7-LABEL: test_div_scale_f32_scalar_num_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dword s8, s[2:3], 0xd ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 @@ -438,8 +456,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f32_scalar_num_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -456,9 +474,10 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(ptr addrspace(1) %out 
; ; GFX10-LABEL: test_div_scale_f32_scalar_num_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] @@ -469,9 +488,12 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(ptr addrspace(1) %out ; ; GFX11-LABEL: test_div_scale_f32_scalar_num_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -494,8 +516,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(ptr addrspace(1) %out, ptr addrspace(1) %in, float %b) { ; GFX7-LABEL: test_div_scale_f32_scalar_den_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dword s8, s[2:3], 0xd ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 @@ -512,8 +534,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f32_scalar_den_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 +; 
GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -530,9 +552,10 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(ptr addrspace(1) %out ; ; GFX10-LABEL: test_div_scale_f32_scalar_den_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] @@ -543,9 +566,12 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(ptr addrspace(1) %out ; ; GFX11-LABEL: test_div_scale_f32_scalar_den_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -568,8 +594,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(ptr addrspace(1) %out, ptr addrspace(1) %in, float %b) { ; GFX7-LABEL: test_div_scale_f32_scalar_den_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dword s8, s[2:3], 0xd ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 
v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 @@ -586,8 +612,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f32_scalar_den_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -604,9 +630,10 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(ptr addrspace(1) %out ; ; GFX10-LABEL: test_div_scale_f32_scalar_den_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] @@ -617,9 +644,12 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(ptr addrspace(1) %out ; ; GFX11-LABEL: test_div_scale_f32_scalar_den_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -642,8 +672,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], double %a) { ; 
GFX7-LABEL: test_div_scale_f64_scalar_num_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x15 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x15 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 @@ -660,8 +690,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f64_scalar_num_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -678,9 +708,10 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out ; ; GFX10-LABEL: test_div_scale_f64_scalar_num_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] @@ -691,10 +722,13 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out ; ; GFX11-LABEL: test_div_scale_f64_scalar_num_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x54 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 
3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -717,8 +751,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], double %a) { ; GFX7-LABEL: test_div_scale_f64_scalar_num_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x15 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x15 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 @@ -735,8 +769,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f64_scalar_num_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -753,9 +787,10 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out ; ; GFX10-LABEL: test_div_scale_f64_scalar_num_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] @@ -766,10 +801,13 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out ; ; GFX11-LABEL: test_div_scale_f64_scalar_num_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: 
v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x54 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -792,8 +830,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], double %b) { ; GFX7-LABEL: test_div_scale_f64_scalar_den_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x15 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x15 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 @@ -810,8 +848,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f64_scalar_den_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -828,9 +866,10 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out ; ; GFX10-LABEL: test_div_scale_f64_scalar_den_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 ; 
GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] @@ -841,10 +880,13 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out ; ; GFX11-LABEL: test_div_scale_f64_scalar_den_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x54 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -867,8 +909,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], double %b) { ; GFX7-LABEL: test_div_scale_f64_scalar_den_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x15 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x15 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 @@ -885,8 +927,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f64_scalar_den_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -903,9 +945,10 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr 
addrspace(1) %out ; ; GFX10-LABEL: test_div_scale_f64_scalar_den_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] @@ -916,10 +959,13 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out ; ; GFX11-LABEL: test_div_scale_f64_scalar_den_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x54 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -942,25 +988,26 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b) { ; GFX7-LABEL: test_div_scale_f32_all_scalar_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s3, s[0:1], 0x1c -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x13 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s4, s[2:3], 0x1c +; GFX7-NEXT: s_load_dword s5, s[2:3], 0x13 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-NEXT: v_div_scale_f32 v0, s[4:5], v0, v0, s4 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_div_scale_f32 v0, s[4:5], v0, v0, s5 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f32_all_scalar_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x70 -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x70 +; GFX8-NEXT: s_load_dword s1, s[2:3], 0x4c +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, s1 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -969,24 +1016,24 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(ptr addrspace(1) %out ; GFX10-LABEL: test_div_scale_f32_all_scalar_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10-NEXT: s_load_dword s5, s[0:1], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x4c +; GFX10-NEXT: s_load_dword s5, s[2:3], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s5, s5, s4 -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: v_div_scale_f32 v0, s2, s5, s5, s4 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f32_all_scalar_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c -; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x4c +; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x70 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: 
v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 +; GFX11-NEXT: v_div_scale_f32 v0, null, s5, s5, s4 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1000,25 +1047,26 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b) { ; GFX7-LABEL: test_div_scale_f32_all_scalar_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s3, s[0:1], 0x1c -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x13 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s4, s[2:3], 0x1c +; GFX7-NEXT: s_load_dword s5, s[2:3], 0x13 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-NEXT: v_div_scale_f32 v0, s[4:5], s4, v0, s4 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_div_scale_f32 v0, s[4:5], s5, v0, s5 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f32_all_scalar_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x70 -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x70 +; GFX8-NEXT: s_load_dword s1, s[2:3], 0x4c +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s1, v0, s1 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s3, v0, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1027,24 +1075,24 @@ define amdgpu_kernel void 
@test_div_scale_f32_all_scalar_2(ptr addrspace(1) %out ; GFX10-LABEL: test_div_scale_f32_all_scalar_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10-NEXT: s_load_dword s5, s[0:1], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x4c +; GFX10-NEXT: s_load_dword s5, s[2:3], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s4, s5, s4 -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: v_div_scale_f32 v0, s2, s4, s5, s4 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f32_all_scalar_2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c -; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x4c +; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x70 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s3, s2 +; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s5, s4 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1058,13 +1106,13 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], double %b) { ; GFX7-LABEL: test_div_scale_f64_all_scalar_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x1d -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x1d +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x13 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[6:7] ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1072,13 +1120,14 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f64_all_scalar_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x74 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x74 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[4:5] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1087,24 +1136,26 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out ; GFX10-LABEL: test_div_scale_f64_all_scalar_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x74 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[4:5], s[4:5], s[2:3] +; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[4:5], s[4:5], s[0:1] +; 
GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f64_all_scalar_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x74 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x74 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[4:5], s[4:5], s[2:3] +; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[4:5], s[4:5], s[0:1] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1118,13 +1169,13 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], double %b) { ; GFX7-LABEL: test_div_scale_f64_all_scalar_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x1d -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x1d +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x13 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[4:5], v[0:1], s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[6:7], v[0:1], s[6:7] ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1132,13 +1183,14 @@ define 
amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f64_all_scalar_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x74 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x74 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[4:5], v[0:1], s[4:5] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[4:5], v[0:1], s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1147,24 +1199,26 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out ; GFX10-LABEL: test_div_scale_f64_all_scalar_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x74 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[2:3], s[4:5], s[2:3] +; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], s[4:5], s[0:1] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f64_all_scalar_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x74 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x4c +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x74 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[2:3], s[4:5], s[2:3] +; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[4:5], s[0:1] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1178,7 +1232,7 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f32_inline_imm_num: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1195,7 +1249,7 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o ; ; GFX8-LABEL: test_div_scale_f32_inline_imm_num: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -1212,7 +1266,7 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o ; ; GFX10-LABEL: test_div_scale_f32_inline_imm_num: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1224,8 +1278,10 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o ; ; GFX11-LABEL: test_div_scale_f32_inline_imm_num: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: 
v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1247,7 +1303,7 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f32_inline_imm_den: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1264,7 +1320,7 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o ; ; GFX8-LABEL: test_div_scale_f32_inline_imm_den: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -1281,7 +1337,7 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o ; ; GFX10-LABEL: test_div_scale_f32_inline_imm_den: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1293,8 +1349,10 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o ; ; GFX11-LABEL: test_div_scale_f32_inline_imm_den: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1316,7 +1374,7 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f32_fabs_num: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1336,7 +1394,7 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: test_div_scale_f32_fabs_num: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -1358,7 +1416,7 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt ; ; GFX10-LABEL: test_div_scale_f32_fabs_num: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -1373,7 +1431,9 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: test_div_scale_f32_fabs_num: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1406,7 +1466,7 @@ define amdgpu_kernel void 
@test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f32_fabs_den: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1426,7 +1486,7 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: test_div_scale_f32_fabs_den: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -1448,7 +1508,7 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt ; ; GFX10-LABEL: test_div_scale_f32_fabs_den: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -1463,7 +1523,9 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: test_div_scale_f32_fabs_den: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1496,7 +1558,7 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %out) #0 { ; GFX7-LABEL: test_div_scale_f32_val_undef_val: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x41000000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], s0, s0, v0 @@ -1508,8 +1570,8 @@ define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %ou ; GFX8-LABEL: test_div_scale_f32_val_undef_val: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v0, 0x41000000 -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s0, s0, v0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, s0, v0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1518,7 +1580,7 @@ define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %ou ; ; GFX10-LABEL: test_div_scale_f32_val_undef_val: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, 0x41000000 @@ -1527,7 +1589,7 @@ define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %ou ; ; GFX11-LABEL: test_div_scale_f32_val_undef_val: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, 0x41000000 @@ -1544,7 +1606,7 @@ define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %ou define amdgpu_kernel void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %out) #0 { ; GFX7-LABEL: test_div_scale_f32_undef_val_val: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x41000000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, s0 @@ -1556,8 +1618,8 @@ define amdgpu_kernel 
void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %ou ; GFX8-LABEL: test_div_scale_f32_undef_val_val: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v0, 0x41000000 -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, s0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, s0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1566,7 +1628,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %ou ; ; GFX10-LABEL: test_div_scale_f32_undef_val_val: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v0, s2, 0x41000000, 0x41000000, s0 @@ -1575,7 +1637,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %ou ; ; GFX11-LABEL: test_div_scale_f32_undef_val_val: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, 0x41000000, 0x41000000, s0 @@ -1592,7 +1654,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %ou define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) %out) #0 { ; GFX7-LABEL: test_div_scale_f32_undef_undef_val: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], s0, s0, s0 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -1602,8 +1664,8 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) % ; ; GFX8-LABEL: test_div_scale_f32_undef_undef_val: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s0, s0, s0 -; GFX8-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, s0, s0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1612,7 +1674,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) % ; ; GFX10-LABEL: test_div_scale_f32_undef_undef_val: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, s0 @@ -1621,7 +1683,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) % ; ; GFX11-LABEL: test_div_scale_f32_undef_undef_val: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, s0 @@ -1638,7 +1700,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) % define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %out) #0 { ; GFX7-LABEL: test_div_scale_f64_val_undef_val: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0x40200000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1652,8 +1714,8 @@ define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %ou ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x40200000 -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[0:1], s[0:1], v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], s[0:1], v[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: 
v_mov_b32_e32 v2, s0 @@ -1662,8 +1724,8 @@ define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %ou ; ; GFX10-LABEL: test_div_scale_f64_val_undef_val: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[0:1], s[0:1], 0x40200000 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], s[0:1], 0x40200000 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1672,7 +1734,7 @@ define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %ou ; GFX11-LABEL: test_div_scale_f64_val_undef_val: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[0:1], 0x40200000 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll index d7b7f03d428bf..2a260823732ca 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; GFX10-LABEL: test_wave32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-NEXT: s_cbranch_scc1 .LBB0_2 @@ -14,7 +14,7 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; GFX10-NEXT: global_store_dword v[0:1], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: .LBB0_2: ; %bb -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x24 ; GFX10-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 @@ -25,16 +25,16 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; ; GFX11-LABEL: test_wave32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX11-NEXT: ; %bb.1: ; %mid ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: .LBB0_2: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll index 81d8472ebd46e..06393857352b3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, i64 %saved) { ; GCN-LABEL: test_wave64: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 +; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 @@ -13,7 +13,7 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, i64 %saved) { ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB0_2: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll index ade6e55b482bb..59818b0b1bc39 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll @@ -134,8 +134,8 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) ; GFX10-LABEL: global_atomic_csub_sgpr_base_offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x1000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 @@ -147,10 +147,10 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) ; GFX11-LABEL: global_atomic_csub_sgpr_base_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4 ; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v0, off @@ -160,7 +160,7 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) ; ; GFX12-LABEL: global_atomic_csub_sgpr_base_offset: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN @@ -179,8 +179,8 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspa ; 
GFX10-LABEL: global_atomic_csub_sgpr_base_offset_nortn: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x1000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 @@ -190,16 +190,16 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspa ; GFX11-LABEL: global_atomic_csub_sgpr_base_offset_nortn: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4 ; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_atomic_csub_sgpr_base_offset_nortn: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll index 752ddbb896c6b..de91c45000f13 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll @@ -61,8 +61,8 @@ define void @global_atomic_fadd_f32_off_neg2047(ptr addrspace(1) %ptr, float %da define amdgpu_kernel void @global_atomic_fadd_f32_off_ss(ptr addrspace(1) %ptr, float %data) { ; GFX908-LABEL: global_atomic_fadd_f32_off_ss: ; GFX908: ; %bb.0: -; 
GFX908-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX908-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX908-NEXT: v_mov_b32_e32 v1, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v0, s2 @@ -71,8 +71,8 @@ define amdgpu_kernel void @global_atomic_fadd_f32_off_ss(ptr addrspace(1) %ptr, ; ; GFX90A-LABEL: global_atomic_fadd_f32_off_ss: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll index 1e0cbde7df0db..ec069c10a8d21 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; GFX10-LABEL: test_wave32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-NEXT: s_load_dword s1, s[4:5], 0x24 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s1, s[6:7], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_eq_u32 s0, 0 ; GFX10-NEXT: s_cselect_b32 s0, 1, 0 @@ -22,14 +22,14 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; GFX11-LABEL: test_wave32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_eq_u32 s2, 0 -; GFX11-NEXT: s_cselect_b32 s1, 1, 0 -; 
GFX11-NEXT: s_and_b32 s1, 1, s1 -; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, s1 -; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 +; GFX11-NEXT: s_cselect_b32 s0, 1, 0 +; GFX11-NEXT: s_and_b32 s0, 1, s0 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll index 9718cef5c6db0..d7a82b415ff06 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll @@ -4,8 +4,8 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, [8 x i32], i64 %saved) { ; GCN-LABEL: test_wave64: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s2, s[4:5], 0x0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa +; GCN-NEXT: s_load_dword s2, s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0xa ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s2, 0 ; GCN-NEXT: s_cselect_b32 s2, 1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll index 546376c5962be..69f9a5712b0b5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -628,7 +628,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x40400000 ; 
GFX1030-NEXT: v_mov_b32_e32 v6, 4.0 @@ -658,7 +658,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000 ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40c00000 @@ -688,33 +688,33 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; ; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; GFX11-NEXT: s_mov_b32 s8, 0x40400000 -; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s10, 0x40a00000 ; GFX11-NEXT: s_mov_b32 s9, 4.0 +; GFX11-NEXT: s_mov_b32 s8, 0x40400000 +; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX11-NEXT: s_mov_b32 s14, 0x41000000 ; GFX11-NEXT: s_mov_b32 s13, 0x40e00000 ; GFX11-NEXT: v_mov_b32_e32 v6, s12 ; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v7, s13 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_mov_b32 s2, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: s_mov_b32 s1, 1.0 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_mov_b32 s1, 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; 
GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX11-NEXT: flat_load_b32 v9, v[0:1] ; GFX11-NEXT: flat_load_b32 v10, v[2:3] -; GFX11-NEXT: s_mov_b32 s2, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9 +; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[4:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -742,7 +742,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200 ; GFX1030-NEXT: v_mov_b32_e32 v6, 0x46004500 @@ -769,7 +769,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) @@ -796,28 +796,29 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; ; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s8, 0x42004600 ; GFX11-NEXT: s_mov_b32 s9, 0x44004700 ; GFX11-NEXT: 
s_mov_b32 s10, 0x45004800 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_mov_b32 s2, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: s_mov_b32 s1, 1.0 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_mov_b32 s1, 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX11-NEXT: flat_load_b32 v6, v[0:1] ; GFX11-NEXT: flat_load_b32 v7, v[2:3] -; GFX11-NEXT: s_mov_b32 s2, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9 +; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[4:7] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -846,8 +847,8 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 @@ -875,8 +876,8 @@ define amdgpu_kernel void 
@image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; GFX1013-NEXT: s_clause 0x1 -; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX1013-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 @@ -888,8 +889,8 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1013-NEXT: v_mov_b32_e32 v10, 0x40e00000 ; GFX1013-NEXT: v_mov_b32_e32 v11, 0x41000000 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, s2 -; GFX1013-NEXT: v_mov_b32_e32 v1, s3 +; GFX1013-NEXT: v_mov_b32_e32 v0, s0 +; GFX1013-NEXT: v_mov_b32_e32 v1, s1 ; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX1013-NEXT: flat_load_dword v2, v[0:1] @@ -903,29 +904,30 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; ; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s16, 0xb36211c7 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: s_mov_b32 s6, 2.0 ; GFX11-NEXT: s_movk_i32 s17, 0x102 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v9, s16 :: v_dual_lshlrev_b32 v2, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s8, 0x40400000 ; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 -; GFX11-NEXT: s_mov_b32 s6, 2.0 ; GFX11-NEXT: s_mov_b32 s10, 0x40a00000 ; GFX11-NEXT: s_mov_b32 s9, 4.0 ; GFX11-NEXT: s_mov_b32 s14, 0x41000000 ; GFX11-NEXT: s_mov_b32 s13, 0x40e00000 
+; GFX11-NEXT: v_dual_mov_b32 v10, s17 :: v_dual_mov_b32 v3, s8 ; GFX11-NEXT: v_mov_b32_e32 v6, s12 -; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v9, s16 -; GFX11-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v4, s9 -; GFX11-NEXT: v_mov_b32_e32 v7, s13 +; GFX11-NEXT: v_mov_b32_e32 v4, s9 +; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v5, s10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: v_mov_b32_e32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_mov_b32 s5, 1.0 -; GFX11-NEXT: v_mov_b32_e32 v10, s17 +; GFX11-NEXT: v_mov_b32_e32 v7, s13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_load_b32 v11, v[0:1] @@ -957,8 +959,8 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 @@ -983,8 +985,8 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; GFX1013-NEXT: s_clause 0x1 -; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX1013-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 @@ -993,8 +995,8 @@ define 
amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x46004500 ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x48004700 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, s2 -; GFX1013-NEXT: v_mov_b32_e32 v1, s3 +; GFX1013-NEXT: v_mov_b32_e32 v0, s0 +; GFX1013-NEXT: v_mov_b32_e32 v1, s1 ; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX1013-NEXT: flat_load_dword v2, v[0:1] @@ -1008,23 +1010,24 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; ; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s12, 0xb36211c6 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX11-NEXT: s_movk_i32 s13, 0x102 ; GFX11-NEXT: s_mov_b32 s6, 2.0 +; GFX11-NEXT: s_movk_i32 s13, 0x102 ; GFX11-NEXT: s_mov_b32 s8, 0x42004600 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_mov_b32 s9, 0x44004700 ; GFX11-NEXT: s_mov_b32 s10, 0x45004800 -; GFX11-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v4, s9 +; GFX11-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v3, s8 +; GFX11-NEXT: v_mov_b32_e32 v7, s13 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x34 +; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: v_mov_b32_e32 v1, s5 -; GFX11-NEXT: s_mov_b32 s5, 1.0 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v7, s13 +; GFX11-NEXT: s_mov_b32 s5, 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, 
vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_load_b32 v8, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll index 5c22d5bdcf744..b0c6e89380d81 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll @@ -9,8 +9,8 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; CI-LABEL: is_private_vgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: s_load_dword s2, s[4:5], 0x32 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x32 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -26,7 +26,7 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX9-LABEL: is_private_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc @@ -39,7 +39,7 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX10-LABEL: is_private_vgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc @@ -53,13 +53,14 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX11-LABEL: is_private_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 
3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-NEXT: global_store_b32 v[0:1], v0, off @@ -78,9 +79,9 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; CI-LABEL: is_private_sgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s0, s[4:5], 0x32 +; CI-NEXT: s_load_dword s0, s[6:7], 0x32 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_lg_u32 s1, s0 ; CI-NEXT: s_cbranch_scc1 .LBB1_2 @@ -93,7 +94,7 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; ; GFX9-LABEL: is_private_sgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s1, s3 @@ -107,7 +108,7 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; ; GFX10-LABEL: is_private_sgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s1, s3 @@ -121,7 +122,7 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; ; GFX11-LABEL: is_private_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll index e005c38355a3c..bbcb807a956be 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll @@ -9,8 +9,8 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; CI-LABEL: is_local_vgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: s_load_dword s2, s[4:5], 0x33 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x33 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -26,7 +26,7 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX9-LABEL: is_local_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc @@ -39,7 +39,7 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX10-LABEL: is_local_vgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc @@ -53,13 +53,14 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX11-LABEL: is_local_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b64 s[0:1], src_shared_base -; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-NEXT: global_store_b32 v[0:1], v0, off @@ -78,9 +79,9 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; CI-LABEL: is_local_sgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s0, s[4:5], 0x33 +; CI-NEXT: s_load_dword s0, s[6:7], 0x33 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_lg_u32 s1, s0 ; CI-NEXT: s_cbranch_scc1 .LBB1_2 @@ -93,7 +94,7 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; GFX9-LABEL: is_local_sgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s1, s3 @@ -107,7 +108,7 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; GFX10-LABEL: is_local_sgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s1, s3 @@ -121,7 +122,7 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; GFX11-LABEL: is_local_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll index 7fc9842824b01..1676b69c8c631 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll @@ -4,9 +4,9 @@ ; ALL-LABEL: {{^}}test: ; OS-MESA3D: enable_sgpr_kernarg_segment_ptr = 1 -; CO-V4: s_load_dword s{{[0-9]+}}, s[4:5], 0xa +; CO-V4: s_load_dword s{{[0-9]+}}, s[8:9], 0xa -; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0xa +; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[4:5], 0xa ; HSA: .amdhsa_kernarg_size 8 ; HSA: .amdhsa_user_sgpr_kernarg_segment_ptr 1 @@ -81,7 +81,7 @@ define amdgpu_kernel void @opencl_test_implicit_alignment(ptr addrspace(1) %out, ; HSA: .amdhsa_kernarg_size 0 ; HSA: .amdhsa_user_sgpr_kernarg_segment_ptr 0 -define amdgpu_kernel void @test_no_kernargs() #1 { +define amdgpu_kernel void @test_no_kernargs() #4 { %kernarg.segment.ptr = call noalias ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() %gep = getelementptr i32, ptr addrspace(4) %kernarg.segment.ptr, i64 10 %value = load i32, ptr addrspace(4) %gep @@ -126,6 +126,7 @@ attributes #0 = { nounwind readnone } attributes #1 = { nounwind "amdgpu-implicitarg-num-bytes"="0" } attributes #2 = { nounwind "amdgpu-implicitarg-num-bytes"="48" } attributes #3 = { nounwind "amdgpu-implicitarg-num-bytes"="38" } +attributes #4 = { nounwind "amdgpu-implicitarg-num-bytes"="0" "amdgpu-no-implicitarg-ptr" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll index e7faabb72ab69..4d012796693cb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll @@ -13,7 +13,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: test_mfma_f32_32x32x4bf16_1k: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GCN-NEXT: 
s_mov_b64 s[36:37], 1 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[36:37], s[36:37] op_sel:[0,1] ; GCN-NEXT: s_mov_b32 s36, 2 @@ -81,7 +81,7 @@ bb: define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: test_mfma_f32_16x16x4bf16_1k: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x24 ; GCN-NEXT: s_mov_b64 s[18:19], 1 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[18:19], s[18:19] op_sel:[0,1] ; GCN-NEXT: s_mov_b32 s18, 2 @@ -127,7 +127,7 @@ bb: define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: test_mfma_f32_4x4x4bf16_1k: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GCN-NEXT: s_mov_b64 s[6:7], 1 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GCN-NEXT: s_mov_b32 s6, 2 @@ -157,7 +157,7 @@ bb: define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: test_mfma_f32_32x32x8bf16_1k: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x24 ; GCN-NEXT: s_mov_b64 s[18:19], 1 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[18:19], s[18:19] op_sel:[0,1] ; GCN-NEXT: s_mov_b32 s18, 2 @@ -204,7 +204,7 @@ bb: define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: test_mfma_f32_16x16x16bf16_1k: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GCN-NEXT: s_mov_b64 s[6:7], 1 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GCN-NEXT: s_mov_b32 s6, 2 @@ -235,11 +235,11 @@ bb: define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double %a, double %b) #0 { ; GCN-LABEL: test_mfma_f64_4x4x4f64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], 0 ; GCN-NEXT: s_nop 3 @@ -258,8 +258,8 @@ bb: define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, double %a, double %b) #0 { ; GCN-LABEL: test_mfma_f64_16x16x4f64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1] ; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 @@ -292,11 +292,11 @@ bb: define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(ptr addrspace(1) %arg, double %a, double %b) #0 { ; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_imm: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 @@ -317,8 +317,8 @@ bb: define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, double %a, double %b) #0 { ; GCN-LABEL: test_mfma_f64_16x16x4f64_imm: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx2 
s[12:13], s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x34 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b64 s[6:7], 1.0 ; GCN-NEXT: s_mov_b64 s[2:3], s[0:1] @@ -352,9 +352,9 @@ bb: define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %arg, double %a, double %b) #0 { ; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_lit: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN-NEXT: s_mov_b32 s4, 0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN-NEXT: s_mov_b32 s5, 0x405ec000 ; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll index c0cd068607200..aa21e67544d65 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll @@ -8,10 +8,10 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) { ; GFX8-LABEL: dpp_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 @@ -22,22 +22,22 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) { ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; encoding: [0x00,0x01,0x00,0xf4,0x2c,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; encoding: 
[0x80,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c ; encoding: [0x01,0x01,0x00,0xf4,0x2c,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; encoding: [0x01,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; encoding: [0x80,0x02,0x02,0x7e] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; encoding: [0x04,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11] -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; encoding: [0x00,0x80,0x70,0xdc,0x01,0x00,0x02,0x00] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; encoding: [0x00,0x80,0x70,0xdc,0x01,0x00,0x00,0x00] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX11-LABEL: dpp_test: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; encoding: [0x80,0x00,0x00,0xf4,0x2c,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; encoding: [0x00,0x00,0x04,0xf4,0x24,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c ; encoding: [0x01,0x01,0x00,0xf4,0x2c,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; encoding: [0x01,0x00,0x04,0xf4,0x24,0x00,0x00,0xf8] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; encoding: [0x80,0x00,0x10,0xca,0x02,0x00,0x00,0x01] +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; encoding: [0x80,0x00,0x10,0xca,0x04,0x00,0x00,0x01] ; GFX11-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11] ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; encoding: [0x00,0x00,0x6a,0xdc,0x01,0x00,0x00,0x00] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] @@ -50,7 +50,7 @@ define 
amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @mov_dpp64_test(ptr addrspace(1) %out, i64 %in1) { ; GFX8-LABEL: mov_dpp64_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -63,7 +63,7 @@ define amdgpu_kernel void @mov_dpp64_test(ptr addrspace(1) %out, i64 %in1) { ; ; GFX10-LABEL: mov_dpp64_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; encoding: [0x00,0x00,0x08,0xf4,0x24,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; encoding: [0x01,0x00,0x08,0xf4,0x24,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e] @@ -75,7 +75,7 @@ define amdgpu_kernel void @mov_dpp64_test(ptr addrspace(1) %out, i64 %in1) { ; ; GFX11-LABEL: mov_dpp64_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; encoding: [0x00,0x00,0x08,0xf4,0x24,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; encoding: [0x01,0x00,0x08,0xf4,0x24,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; encoding: [0x02,0x00,0x10,0xca,0x03,0x00,0x00,0x00] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll index 1eb0c2a877425..dd351e193e9e6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll @@ -2,10 +2,20 @@ ; FIXME: Error on non-hsa target -; GCN-LABEL: {{^}}test: +; GCN-LABEL: {{^}}queue_ptr: +; GCN: s_load_dword 
s{{[0-9]+}}, s[6:7], 0x0 +; GCN: .amdhsa_user_sgpr_queue_ptr 1 +define amdgpu_kernel void @queue_ptr(ptr addrspace(1) %out) { + %queue_ptr = call noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0 + %value = load i32, ptr addrspace(4) %queue_ptr + store i32 %value, ptr addrspace(1) %out + ret void +} + +; GCN-LABEL: {{^}}queue_ptr_opt: ; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 ; GCN: .amdhsa_user_sgpr_queue_ptr 1 -define amdgpu_kernel void @test(ptr addrspace(1) %out) { +define amdgpu_kernel void @queue_ptr_opt(ptr addrspace(1) %out) #1 { %queue_ptr = call noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0 %value = load i32, ptr addrspace(4) %queue_ptr store i32 %value, ptr addrspace(1) %out @@ -15,6 +25,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out) { declare noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0 attributes #0 = { nounwind readnone } +attributes #1 = { "amdgpu-no-dispatch-ptr" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll index 6d4aa3b04d761..5a4b4e62bd8ae 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll @@ -44,7 +44,7 @@ define amdgpu_ps i64 @s_bfe_i64_arg_arg_arg(i64 inreg %src0, i32 inreg %src1, i3 define amdgpu_kernel void @bfe_i32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_i32_arg_arg_imm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s3, s3, 63 ; GFX6-NEXT: s_or_b32 s3, s3, 0x7b0000 @@ -62,7 +62,7 @@ define amdgpu_kernel void @bfe_i32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_i32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, i32 %src2) #0 { ; GFX6-LABEL: bfe_i32_arg_imm_arg: ; GFX6: ; 
%bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 s3, 59, s3 @@ -80,7 +80,7 @@ define amdgpu_kernel void @bfe_i32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_i32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, i32 %src2) #0 { ; GFX6-LABEL: bfe_i32_imm_arg_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s4, s2, 63 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 @@ -99,7 +99,7 @@ define amdgpu_kernel void @bfe_i32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, define amdgpu_kernel void @v_bfe_print_arg(ptr addrspace(1) %out, ptr addrspace(1) %src0) #0 { ; GFX6-LABEL: v_bfe_print_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -118,7 +118,7 @@ define amdgpu_kernel void @v_bfe_print_arg(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_i32_arg_0_width_reg_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s3, s3, 63 ; GFX6-NEXT: s_bfe_i32 s3, s2, s3 @@ -135,11 +135,11 @@ define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_i32_arg_0_width_imm_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_i32_arg_0_width_imm_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s3, s[0:1], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 +; GFX6-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s3, s3, 8 +; GFX6-NEXT: s_bfe_i32 s3, s4, 8 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -152,7 +152,7 @@ define amdgpu_kernel void @bfe_i32_arg_0_width_imm_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_i32_test_6(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_6: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -173,7 +173,7 @@ define amdgpu_kernel void @bfe_i32_test_6(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_i32_test_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -194,7 +194,7 @@ define amdgpu_kernel void @bfe_i32_test_7(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_i32_test_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -215,7 +215,7 @@ define amdgpu_kernel void @bfe_i32_test_8(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_i32_test_9(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_9: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -234,7 +234,7 @@ define amdgpu_kernel void @bfe_i32_test_9(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_i32_test_10(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_10: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -253,7 +253,7 @@ define amdgpu_kernel void @bfe_i32_test_10(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_i32_test_11(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_11: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -272,7 +272,7 @@ define amdgpu_kernel void @bfe_i32_test_11(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_i32_test_12(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_12: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -291,7 +291,7 @@ define amdgpu_kernel void @bfe_i32_test_12(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_i32_test_13(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_13: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -311,7 +311,7 @@ define amdgpu_kernel void @bfe_i32_test_13(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void 
@bfe_i32_test_14(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_14: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -331,7 +331,7 @@ define amdgpu_kernel void @bfe_i32_test_14(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_i32_constant_fold_test_0(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_0: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_bfe_i32 s2, 0, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -347,7 +347,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_0(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_1(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_bfe_i32 s2, 0x302e, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -363,7 +363,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_1(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_2(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_bfe_i32 s2, 0, 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -379,7 +379,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_2(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_3(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; 
GFX6-NEXT: s_bfe_i32 s2, 1, 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -395,7 +395,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_3(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_4(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_bfe_i32 s2, -1, 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -411,7 +411,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_4(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_5(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_5: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x10007 ; GFX6-NEXT: s_bfe_i32 s2, 0x80, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -428,7 +428,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_5(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_6(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_6: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80000 ; GFX6-NEXT: s_bfe_i32 s2, 0x80, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -445,7 +445,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_6(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_7(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80000 ; GFX6-NEXT: s_bfe_i32 s2, 0x7f, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -462,7 +462,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_7(ptr addrspace(1) %out) # 
define amdgpu_kernel void @bfe_i32_constant_fold_test_8(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80006 ; GFX6-NEXT: s_bfe_i32 s2, 0x7f, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -479,7 +479,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_8(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_9(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_9: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80010 ; GFX6-NEXT: s_bfe_i32 s2, 0x10000, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -496,7 +496,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_9(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_10(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_10: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x100010 ; GFX6-NEXT: s_bfe_i32 s2, 0xffff, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -513,7 +513,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_10(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_11(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_11: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x40004 ; GFX6-NEXT: s_bfe_i32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -530,7 +530,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_11(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_12(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_12: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1001f ; GFX6-NEXT: s_bfe_i32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -547,7 +547,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_12(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_13(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_13: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x100010 ; GFX6-NEXT: s_bfe_i32 s2, 0x1fffe, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -564,7 +564,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_13(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_14(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_14: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1e0002 ; GFX6-NEXT: s_bfe_i32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -581,7 +581,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_14(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_15(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1c0004 ; GFX6-NEXT: s_bfe_i32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -598,7 +598,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_15(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_16(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_bfe_i32 s2, -1, 0x70001 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -614,7 +614,7 @@ define amdgpu_kernel 
void @bfe_i32_constant_fold_test_16(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_17(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_17: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1f0001 ; GFX6-NEXT: s_bfe_i32 s2, 0xff, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -631,7 +631,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_17(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_18(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_18: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1001f ; GFX6-NEXT: s_bfe_i32 s2, 0xff, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -648,7 +648,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_18(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_sext_in_reg_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_sext_in_reg_i24: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -670,7 +670,7 @@ define amdgpu_kernel void @bfe_sext_in_reg_i24(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @simplify_demanded_bfe_sdiv(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: simplify_demanded_bfe_sdiv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -694,7 +694,7 @@ define amdgpu_kernel void @simplify_demanded_bfe_sdiv(ptr addrspace(1) %out, ptr define amdgpu_kernel void @bfe_0_width(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { ; GFX6-LABEL: bfe_0_width: 
; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -713,7 +713,7 @@ define amdgpu_kernel void @bfe_0_width(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @bfe_8_bfe_8(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { ; GFX6-LABEL: bfe_8_bfe_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -734,7 +734,7 @@ define amdgpu_kernel void @bfe_8_bfe_8(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @bfe_8_bfe_16(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { ; GFX6-LABEL: bfe_8_bfe_16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -756,7 +756,7 @@ define amdgpu_kernel void @bfe_8_bfe_16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @bfe_16_bfe_8(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { ; GFX6-LABEL: bfe_16_bfe_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -778,7 +778,7 @@ define amdgpu_kernel void @bfe_16_bfe_8(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GFX6-LABEL: sext_in_reg_i8_to_i32_bfe: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_add_i32 s3, s2, s3 ; GFX6-NEXT: s_bfe_i32 s3, s3, 
0x80000 @@ -799,7 +799,7 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe(ptr addrspace(1) %out, i32 define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe_wrong(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GFX6-LABEL: sext_in_reg_i8_to_i32_bfe_wrong: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_add_i32 s3, s2, s3 ; GFX6-NEXT: s_bfe_i32 s3, s3, 8 @@ -820,7 +820,7 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe_wrong(ptr addrspace(1) %out define amdgpu_kernel void @sextload_i8_to_i32_bfe(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { ; GFX6-LABEL: sextload_i8_to_i32_bfe: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -844,7 +844,7 @@ define amdgpu_kernel void @sextload_i8_to_i32_bfe(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @sextload_i8_to_i32_bfe_0(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { ; GFX6-LABEL: sextload_i8_to_i32_bfe_0: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -868,7 +868,7 @@ define amdgpu_kernel void @sextload_i8_to_i32_bfe_0(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: sext_in_reg_i1_bfe_offset_0: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -890,7 +890,7 @@ define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_0(ptr addrspace(1) %out, pt define amdgpu_kernel void 
@sext_in_reg_i1_bfe_offset_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: sext_in_reg_i1_bfe_offset_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -912,7 +912,7 @@ define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_1(ptr addrspace(1) %out, pt define amdgpu_kernel void @sext_in_reg_i2_bfe_offset_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: sext_in_reg_i2_bfe_offset_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll index 0c60be9d94591..5074f8814546e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -4,11 +4,11 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: set_inactive: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 ; GCN-NEXT: s_not_b64 exec, exec @@ -23,7 +23,7 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { ; GCN-LABEL: set_inactive_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 
; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -43,20 +43,20 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x i32> inreg %desc) { ; GCN-LABEL: set_inactive_scc: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_buffer_load_dword s2, s[4:7], 0x0 -; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0 +; GCN-NEXT: s_load_dword s5, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s2, 56 -; GCN-NEXT: s_cselect_b32 s4, 1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: s_cmp_lg_u32 s4, 56 +; GCN-NEXT: s_cselect_b32 s3, 1, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: s_mov_b32 s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cmp_lg_u32 s3, 0 ; GCN-NEXT: s_cbranch_scc0 .LBB2_2 ; GCN-NEXT: ; %bb.1: ; %.one ; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0 @@ -96,12 +96,12 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { ; GCN-LABEL: set_inactive_f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0x40400000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_not_b64 exec, exec @@ -116,7 +116,7 @@ 
define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { ; GCN-LABEL: set_inactive_f64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0xcccccccd ; GCN-NEXT: s_mov_b32 s5, 0x4010cccc ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -140,12 +140,12 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) { ; GCN-LABEL: set_inactive_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0x10001 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_not_b64 exec, exec @@ -160,12 +160,12 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> % define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GCN-LABEL: set_inactive_v2f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_not_b64 exec, exec @@ -180,7 +180,7 @@ define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) { ; GCN-LABEL: 
set_inactive_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s4, 1 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -204,7 +204,7 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> % define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GCN-LABEL: set_inactive_v2f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s4, 1.0 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -228,12 +228,12 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) { ; GCN-LABEL: set_inactive_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0x3f803f80 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_not_b64 exec, exec @@ -248,7 +248,7 @@ define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloa define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) { ; GCN-LABEL: set_inactive_v4i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x10001 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -272,7 +272,7 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> % define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; GCN-LABEL: set_inactive_v4f16: 
; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -296,7 +296,7 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) { ; GCN-LABEL: set_inactive_v4bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x3f803f80 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -320,7 +320,7 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { ; GCN-LABEL: set_inactive_p0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -340,11 +340,11 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) { ; GCN-LABEL: set_inactive_p2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_not_b64 exec, exec @@ -359,11 +359,11 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) { ; GCN-LABEL: set_inactive_p3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c 
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_not_b64 exec, exec @@ -378,11 +378,11 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) { ; GCN-LABEL: set_inactive_p5: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_not_b64 exec, exec @@ -397,11 +397,11 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) { ; GCN-LABEL: set_inactive_p6: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_not_b64 exec, exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll index 1d5cc1e1ec046..f3654fea486e0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll @@ -40,8 
+40,8 @@ define double @v_trig_preop_f64_imm(double %a, i32 %b) { define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; CI-LABEL: s_trig_preop_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 @@ -57,8 +57,8 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; ; VI-LABEL: s_trig_preop_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 @@ -74,8 +74,8 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; ; GFX9-LABEL: s_trig_preop_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 @@ -86,8 +86,8 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; GFX10-LABEL: s_trig_preop_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_trig_preop_f64 v[0:1], s[0:1], s2 ; GFX10-NEXT: flat_store_dwordx2 v[0:1], v[0:1] @@ -97,10 +97,10 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; GFX11-LABEL: s_trig_preop_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], 
s[0:1], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_trig_preop_f64 v[0:1], s[2:3], s0 +; GFX11-NEXT: v_trig_preop_f64 v[0:1], s[0:1], s2 ; GFX11-NEXT: flat_store_b64 v[0:1], v[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_endpgm @@ -112,7 +112,7 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; CI-LABEL: s_trig_preop_f64_imm: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; CI-NEXT: s_add_u32 s0, s0, 4 @@ -127,7 +127,7 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; ; VI-LABEL: s_trig_preop_f64_imm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; VI-NEXT: s_add_u32 s0, s0, 4 @@ -142,7 +142,7 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; ; GFX9-LABEL: s_trig_preop_f64_imm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[0:1] @@ -151,7 +151,7 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; ; GFX10-LABEL: s_trig_preop_f64_imm: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; GFX10-NEXT: flat_store_dwordx2 v[0:1], v[0:1] @@ -160,7 +160,7 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; ; GFX11-LABEL: 
s_trig_preop_f64_imm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; GFX11-NEXT: flat_store_b64 v[0:1], v[0:1] dlc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll index 43a0f018dc1cd..d7fbec74af385 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll @@ -44,7 +44,7 @@ define amdgpu_ps i64 @s_bfe_i64_arg_arg_arg(i64 inreg %src0, i32 inreg %src1, i3 define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 { ; GFX6-LABEL: bfe_u32_arg_arg_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s4, s3, 63 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 @@ -63,7 +63,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_u32_arg_arg_imm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s3, s3, 63 ; GFX6-NEXT: s_or_b32 s3, s3, 0x7b0000 @@ -81,7 +81,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, i32 %src2) #0 { ; GFX6-LABEL: bfe_u32_arg_imm_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 s3, 59, s3 @@ -99,7 +99,7 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) 
%out, i32 %src0, define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, i32 %src2) #0 { ; GFX6-LABEL: bfe_u32_imm_arg_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s4, s2, 63 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 @@ -118,7 +118,7 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_u32_arg_0_width_reg_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s3, s3, 63 ; GFX6-NEXT: s_bfe_u32 s3, s2, s3 @@ -135,11 +135,11 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_u32_arg_0_width_imm_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s3, s[0:1], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s3, s3, 8 +; GFX6-NEXT: s_bfe_u32 s3, s4, 8 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -152,7 +152,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zextload_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ 
-174,7 +174,7 @@ define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -197,7 +197,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -220,7 +220,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i8_offset_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -243,7 +243,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i8_offset_3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -266,7 +266,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out define amdgpu_kernel 
void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i8_offset_7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -289,7 +289,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i16_offset_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -312,7 +312,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %ou define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -331,7 +331,7 @@ define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -352,7 +352,7 @@ define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -373,7 +373,7 @@ define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -395,7 +395,7 @@ define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_5: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -417,7 +417,7 @@ define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_6: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -438,7 +438,7 @@ define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -459,7 +459,7 @@ define amdgpu_kernel void 
@bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -480,7 +480,7 @@ define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_9: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -499,7 +499,7 @@ define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_10: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -518,7 +518,7 @@ define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_11: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -537,7 +537,7 @@ define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_12: ; GFX6: ; %bb.0: -; GFX6-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -557,7 +557,7 @@ define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_13: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -577,7 +577,7 @@ define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_14: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -597,7 +597,7 @@ define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_0: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_bfe_u32 s2, 0, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -613,7 +613,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_bfe_u32 s2, 0x302e, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -629,7 +629,7 @@ define 
amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_bfe_u32 s2, 0, 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -645,7 +645,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_bfe_u32 s2, 1, 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -661,7 +661,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_bfe_u32 s2, -1, 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -677,7 +677,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_5: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x10007 ; GFX6-NEXT: s_bfe_u32 s2, 0x80, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -694,7 +694,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_6: ; GFX6: ; 
%bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80000 ; GFX6-NEXT: s_bfe_u32 s2, 0x80, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -711,7 +711,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80000 ; GFX6-NEXT: s_bfe_u32 s2, 0x7f, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -728,7 +728,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80006 ; GFX6-NEXT: s_bfe_u32 s2, 0x7f, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -745,7 +745,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_9: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80010 ; GFX6-NEXT: s_bfe_u32 s2, 0x10000, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -762,7 +762,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_10: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x100010 ; GFX6-NEXT: s_bfe_u32 s2, 0xffff, s2 ; GFX6-NEXT: 
v_mov_b32_e32 v0, s2 @@ -779,7 +779,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_11: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x40004 ; GFX6-NEXT: s_bfe_u32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -796,7 +796,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_12: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1001f ; GFX6-NEXT: s_bfe_u32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -813,7 +813,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_13: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x100010 ; GFX6-NEXT: s_bfe_u32 s2, 0x1fffe, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -830,7 +830,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_14: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1e0002 ; GFX6-NEXT: s_bfe_u32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -847,7 +847,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) 
%out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1c0004 ; GFX6-NEXT: s_bfe_u32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -864,7 +864,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_bfe_u32 s2, -1, 0x70001 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -880,7 +880,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_17: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1f0001 ; GFX6-NEXT: s_bfe_u32 s2, 0xff, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -897,7 +897,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_18: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1001f ; GFX6-NEXT: s_bfe_u32 s2, 0xff, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -918,8 +918,8 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(ptr addrspace(1) %out0, ; GFX6-LABEL: simplify_bfe_u32_multi_use_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx2 
s[4:5], s[2:3], 0x4 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -947,11 +947,11 @@ define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(ptr addrspace(1) %out0 define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 { ; GFX6-LABEL: lshr_and: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s3, s[0:1], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s3, s3, 0x30006 +; GFX6-NEXT: s_bfe_u32 s3, s4, 0x30006 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -965,7 +965,7 @@ define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GFX6-LABEL: v_lshr_and: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshr_b32 s3, s2, s3 ; GFX6-NEXT: s_and_b32 s3, s3, 7 @@ -983,11 +983,11 @@ define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; GFX6-LABEL: and_lshr: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s3, s[0:1], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s3, s3, 0x30006 +; GFX6-NEXT: s_bfe_u32 s3, s4, 0x30006 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1001,11 +1001,11 @@ define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) 
#0 { define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 { ; GFX6-LABEL: and_lshr2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s3, s[0:1], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s3, s3, 0x30006 +; GFX6-NEXT: s_bfe_u32 s3, s4, 0x30006 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1019,11 +1019,11 @@ define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @shl_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; GFX6-LABEL: shl_lshr: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s3, s[0:1], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s3, s3, 0x150002 +; GFX6-NEXT: s_bfe_u32 s3, s4, 0x150002 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll index 727184a36c006..2198ba9f1d964 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { ; GFX8-LABEL: dpp_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s3 @@ -19,7 +19,7 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { ; ; GFX10-LABEL: 
dpp_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 @@ -30,7 +30,7 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { ; ; GFX11-LABEL: dpp_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -46,7 +46,7 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i64 %in2) { ; GFX8-LABEL: update_dppi64_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -64,7 +64,7 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i ; ; GFX10-LABEL: update_dppi64_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] @@ -78,10 +78,11 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i ; ; GFX11-LABEL: update_dppi64_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: global_load_b64 
v[0:1], v4, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -101,7 +102,7 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1, double %in2) { ; GFX8-LABEL: update_dppf64_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -119,7 +120,7 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1 ; ; GFX10-LABEL: update_dppf64_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] @@ -133,10 +134,11 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1 ; ; GFX11-LABEL: update_dppf64_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -156,7 +158,7 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1 define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> %in1, <2 x i32> %in2) { ; GFX8-LABEL: update_dppv2i32_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: 
v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -174,7 +176,7 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> ; ; GFX10-LABEL: update_dppv2i32_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] @@ -188,10 +190,11 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> ; ; GFX11-LABEL: update_dppv2i32_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -211,7 +214,7 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x float> %in1, <2 x float> %in2) { ; GFX8-LABEL: update_dppv2f32_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -229,7 +232,7 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa ; ; GFX10-LABEL: update_dppv2f32_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] @@ 
-243,10 +246,11 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa ; ; GFX11-LABEL: update_dppv2f32_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -266,7 +270,7 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, ptr %in2) { ; GFX8-LABEL: update_dpp_p0_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -284,7 +288,7 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p ; ; GFX10-LABEL: update_dpp_p0_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] @@ -298,10 +302,11 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p ; ; GFX11-LABEL: update_dpp_p0_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: global_load_b64 v[0:1], 
v4, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -321,7 +326,7 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspace(3) %in1, ptr %in2) { ; GFX8-LABEL: update_dpp_p3_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -336,7 +341,7 @@ define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspa ; ; GFX10-LABEL: update_dpp_p3_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 @@ -349,7 +354,8 @@ define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspa ; ; GFX11-LABEL: update_dpp_p3_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0 @@ -371,11 +377,11 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa ; GFX8-LABEL: update_dpp_p5_test: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s90, -1 ; GFX8-NEXT: s_mov_b32 s91, 0xe80000 -; GFX8-NEXT: s_add_u32 s88, s88, s3 +; GFX8-NEXT: s_add_u32 s88, s88, s9 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_addc_u32 s89, s89, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -390,26 +396,27 
@@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa ; ; GFX10-LABEL: update_dpp_p5_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; GFX10-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 -; GFX10-NEXT: s_add_u32 s4, s4, s3 -; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-NEXT: s_mov_b32 s14, -1 +; GFX10-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-NEXT: s_add_u32 s12, s12, s9 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-NEXT: buffer_load_dword v1, v0, s[4:7], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: buffer_store_dword v2, v0, s[4:7], 0 offen +; GFX10-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: update_dpp_p5_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workgroup.id.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workgroup.id.ll index df201c1903b64..b2546700a935d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workgroup.id.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workgroup.id.ll @@ -1,7 +1,8 @@ -; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii -verify-machineinstrs < %s | 
FileCheck --check-prefixes=ALL,UNKNOWN-OS %s -; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,MESA3D %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,MESA3D %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s -o %t.bc +; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii < %t.bc | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s +; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga < %t.bc | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s +; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=hawaii < %t.bc | FileCheck -check-prefixes=ALL,MESA3D %s +; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga < %t.bc | FileCheck -check-prefixes=ALL,MESA3D %s declare i32 @llvm.amdgcn.workgroup.id.x() #0 declare i32 @llvm.amdgcn.workgroup.id.y() #0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll index 09882c446fc0f..d5646820a1983 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll @@ -1,12 +1,14 @@ -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs| FileCheck --check-prefixes=ALL,HSA,UNPACKED %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii -mattr=+flat-for-global -verify-machineinstrs | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel 
-mtriple=amdgcn-- -mcpu=tonga -mattr=+flat-for-global -verify-machineinstrs | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=+flat-for-global -mcpu=hawaii -verify-machineinstrs | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -verify-machineinstrs | FileCheck -check-prefixes=ALL,PACKED-TID %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 | FileCheck -check-prefixes=ALL,PACKED-TID %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx11-generic -verify-machineinstrs -amdgpu-enable-vopd=0 | FileCheck -check-prefixes=ALL,PACKED-TID %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor -o %t.v4.ll +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor -o %t.v6.ll +; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %t.v4.ll | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s +; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %t.v4.ll | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s +; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii -mattr=+flat-for-global -verify-machineinstrs < %t.v4.ll | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s +; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -mattr=+flat-for-global -verify-machineinstrs < %t.v4.ll | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s +; RUN: llc -global-isel 
-mtriple=amdgcn-unknown-mesa3d -mattr=+flat-for-global -mcpu=hawaii -verify-machineinstrs < %t.v4.ll | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s +; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs < %t.v4.ll | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s +; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -verify-machineinstrs < %t.v4.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s +; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %t.v4.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s +; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx11-generic -verify-machineinstrs -amdgpu-enable-vopd=0 < %t.v6.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s declare i32 @llvm.amdgcn.workitem.id.x() #0 declare i32 @llvm.amdgcn.workitem.id.y() #0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll index 36bac87889cac..646cb48d37367 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @localize_constants(i1 %cond) { ; GFX9-LABEL: localize_constants: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_xor_b32 s1, s1, 1 @@ -95,7 +95,7 @@ bb2: define amdgpu_kernel void @localize_globals(i1 %cond) { ; GFX9-LABEL: localize_globals: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_xor_b32 s1, s1, 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll index 
2727fdec035d2..66037615f0ba0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll @@ -10,7 +10,7 @@ ; Note: we use MIR test checks + stop after legalizer to prevent ; tests from being optimized out. -define amdgpu_kernel void @system_one_as_acquire() { +define amdgpu_kernel void @system_one_as_acquire() #0 { ; GFX6-LABEL: name: system_one_as_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 3952 @@ -59,7 +59,7 @@ entry: ret void } -define amdgpu_kernel void @system_one_as_release() { +define amdgpu_kernel void @system_one_as_release() #0 { ; GFX6-LABEL: name: system_one_as_release ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 3952 @@ -98,7 +98,7 @@ entry: ret void } -define amdgpu_kernel void @system_one_as_acq_rel() { +define amdgpu_kernel void @system_one_as_acq_rel() #0 { ; GFX6-LABEL: name: system_one_as_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 3952 @@ -147,7 +147,7 @@ entry: ret void } -define amdgpu_kernel void @system_one_as_seq_cst() { +define amdgpu_kernel void @system_one_as_seq_cst() #0 { ; GFX6-LABEL: name: system_one_as_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 3952 @@ -196,7 +196,7 @@ entry: ret void } -define amdgpu_kernel void @singlethread_one_as_acquire() { +define amdgpu_kernel void @singlethread_one_as_acquire() #0 { ; GFX6-LABEL: name: singlethread_one_as_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -225,7 +225,7 @@ entry: ret void } -define amdgpu_kernel void @singlethread_one_as_release() { +define amdgpu_kernel void @singlethread_one_as_release() #0 { ; GFX6-LABEL: name: singlethread_one_as_release ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -254,7 +254,7 @@ entry: ret void } -define amdgpu_kernel void @singlethread_one_as_acq_rel() { +define amdgpu_kernel void @singlethread_one_as_acq_rel() #0 { ; GFX6-LABEL: name: singlethread_one_as_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ 
-283,7 +283,7 @@ entry: ret void } -define amdgpu_kernel void @singlethread_one_as_seq_cst() { +define amdgpu_kernel void @singlethread_one_as_seq_cst() #0 { ; GFX6-LABEL: name: singlethread_one_as_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -312,7 +312,7 @@ entry: ret void } -define amdgpu_kernel void @agent_one_as_acquire() { +define amdgpu_kernel void @agent_one_as_acquire() #0 { ; GFX6-LABEL: name: agent_one_as_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 3952 @@ -361,7 +361,7 @@ entry: ret void } -define amdgpu_kernel void @agent_one_as_release() { +define amdgpu_kernel void @agent_one_as_release() #0 { ; GFX6-LABEL: name: agent_one_as_release ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 3952 @@ -400,7 +400,7 @@ entry: ret void } -define amdgpu_kernel void @agent_one_as_acq_rel() { +define amdgpu_kernel void @agent_one_as_acq_rel() #0 { ; GFX6-LABEL: name: agent_one_as_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 3952 @@ -449,7 +449,7 @@ entry: ret void } -define amdgpu_kernel void @agent_one_as_seq_cst() { +define amdgpu_kernel void @agent_one_as_seq_cst() #0 { ; GFX6-LABEL: name: agent_one_as_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 3952 @@ -498,7 +498,7 @@ entry: ret void } -define amdgpu_kernel void @workgroup_one_as_acquire() { +define amdgpu_kernel void @workgroup_one_as_acquire() #0 { ; GFX6-LABEL: name: workgroup_one_as_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -533,7 +533,7 @@ entry: ret void } -define amdgpu_kernel void @workgroup_one_as_release() { +define amdgpu_kernel void @workgroup_one_as_release() #0 { ; GFX6-LABEL: name: workgroup_one_as_release ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -566,7 +566,7 @@ entry: ret void } -define amdgpu_kernel void @workgroup_one_as_acq_rel() { +define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 { ; GFX6-LABEL: name: workgroup_one_as_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -601,7 +601,7 @@ entry: ret void } -define 
amdgpu_kernel void @workgroup_one_as_seq_cst() { +define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 { ; GFX6-LABEL: name: workgroup_one_as_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -636,7 +636,7 @@ entry: ret void } -define amdgpu_kernel void @wavefront_one_as_acquire() { +define amdgpu_kernel void @wavefront_one_as_acquire() #0 { ; GFX6-LABEL: name: wavefront_one_as_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -665,7 +665,7 @@ entry: ret void } -define amdgpu_kernel void @wavefront_one_as_release() { +define amdgpu_kernel void @wavefront_one_as_release() #0 { ; GFX6-LABEL: name: wavefront_one_as_release ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -694,7 +694,7 @@ entry: ret void } -define amdgpu_kernel void @wavefront_one_as_acq_rel() { +define amdgpu_kernel void @wavefront_one_as_acq_rel() #0 { ; GFX6-LABEL: name: wavefront_one_as_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -723,7 +723,7 @@ entry: ret void } -define amdgpu_kernel void @wavefront_one_as_seq_cst() { +define amdgpu_kernel void @wavefront_one_as_seq_cst() #0 { ; GFX6-LABEL: name: wavefront_one_as_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -752,7 +752,7 @@ entry: ret void } -define amdgpu_kernel void @system_acquire() { +define amdgpu_kernel void @system_acquire() #0 { ; GFX6-LABEL: name: system_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 112 @@ -801,7 +801,7 @@ entry: ret void } -define amdgpu_kernel void @system_release() { +define amdgpu_kernel void @system_release() #0 { ; GFX6-LABEL: name: system_release ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 112 @@ -840,7 +840,7 @@ entry: ret void } -define amdgpu_kernel void @system_acq_rel() { +define amdgpu_kernel void @system_acq_rel() #0 { ; GFX6-LABEL: name: system_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 112 @@ -889,7 +889,7 @@ entry: ret void } -define amdgpu_kernel void @system_seq_cst() { +define amdgpu_kernel void @system_seq_cst() #0 { ; GFX6-LABEL: name: 
system_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 112 @@ -938,7 +938,7 @@ entry: ret void } -define amdgpu_kernel void @singlethread_acquire() { +define amdgpu_kernel void @singlethread_acquire() #0 { ; GFX6-LABEL: name: singlethread_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -967,7 +967,7 @@ entry: ret void } -define amdgpu_kernel void @singlethread_release() { +define amdgpu_kernel void @singlethread_release() #0 { ; GFX6-LABEL: name: singlethread_release ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -996,7 +996,7 @@ entry: ret void } -define amdgpu_kernel void @singlethread_acq_rel() { +define amdgpu_kernel void @singlethread_acq_rel() #0 { ; GFX6-LABEL: name: singlethread_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -1025,7 +1025,7 @@ entry: ret void } -define amdgpu_kernel void @singlethread_seq_cst() { +define amdgpu_kernel void @singlethread_seq_cst() #0 { ; GFX6-LABEL: name: singlethread_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -1054,7 +1054,7 @@ entry: ret void } -define amdgpu_kernel void @agent_acquire() { +define amdgpu_kernel void @agent_acquire() #0 { ; GFX6-LABEL: name: agent_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 112 @@ -1103,7 +1103,7 @@ entry: ret void } -define amdgpu_kernel void @agent_release() { +define amdgpu_kernel void @agent_release() #0 { ; GFX6-LABEL: name: agent_release ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 112 @@ -1142,7 +1142,7 @@ entry: ret void } -define amdgpu_kernel void @agent_acq_rel() { +define amdgpu_kernel void @agent_acq_rel() #0 { ; GFX6-LABEL: name: agent_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 112 @@ -1191,7 +1191,7 @@ entry: ret void } -define amdgpu_kernel void @agent_seq_cst() { +define amdgpu_kernel void @agent_seq_cst() #0 { ; GFX6-LABEL: name: agent_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 112 @@ -1240,7 +1240,7 @@ entry: ret void } -define amdgpu_kernel void @workgroup_acquire() { +define amdgpu_kernel void 
@workgroup_acquire() #0 { ; GFX6-LABEL: name: workgroup_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 127 @@ -1279,7 +1279,7 @@ entry: ret void } -define amdgpu_kernel void @workgroup_release() { +define amdgpu_kernel void @workgroup_release() #0 { ; GFX6-LABEL: name: workgroup_release ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 127 @@ -1316,7 +1316,7 @@ entry: ret void } -define amdgpu_kernel void @workgroup_acq_rel() { +define amdgpu_kernel void @workgroup_acq_rel() #0 { ; GFX6-LABEL: name: workgroup_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 127 @@ -1355,7 +1355,7 @@ entry: ret void } -define amdgpu_kernel void @workgroup_seq_cst() { +define amdgpu_kernel void @workgroup_seq_cst() #0 { ; GFX6-LABEL: name: workgroup_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 127 @@ -1394,7 +1394,7 @@ entry: ret void } -define amdgpu_kernel void @wavefront_acquire() { +define amdgpu_kernel void @wavefront_acquire() #0 { ; GFX6-LABEL: name: wavefront_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -1423,7 +1423,7 @@ entry: ret void } -define amdgpu_kernel void @wavefront_release() { +define amdgpu_kernel void @wavefront_release() #0 { ; GFX6-LABEL: name: wavefront_release ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -1452,7 +1452,7 @@ entry: ret void } -define amdgpu_kernel void @wavefront_acq_rel() { +define amdgpu_kernel void @wavefront_acq_rel() #0 { ; GFX6-LABEL: name: wavefront_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -1481,7 +1481,7 @@ entry: ret void } -define amdgpu_kernel void @wavefront_seq_cst() { +define amdgpu_kernel void @wavefront_seq_cst() #0 { ; GFX6-LABEL: name: wavefront_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -1509,3 +1509,5 @@ entry: fence syncscope("wavefront") seq_cst ret void } + +attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" 
"amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll index 1140ef88ac7f8..577a7d0b4cba0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll @@ -7,7 +7,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; GFX10-LABEL: v_mul_i64_no_zext: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2c ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -23,7 +23,9 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac ; ; GFX11-LABEL: v_mul_i64_no_zext: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x2c +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -56,13 +58,13 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_mul_i64_zext_src1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] -; GFX10-NEXT: global_load_dword v4, v3, s[2:3] +; GFX10-NEXT: global_load_dword v4, v3, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v0, 
v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 @@ -75,8 +77,10 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_mul_i64_zext_src1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -108,13 +112,13 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_mul_i64_zext_src0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v2, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v4, v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 @@ -127,8 +131,10 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_mul_i64_zext_src0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -160,13 
+166,13 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a ; GFX10-LABEL: v_mul_i64_zext_src0_src1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -176,10 +182,12 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a ; GFX11-LABEL: v_mul_i64_zext_src0_src1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] @@ -207,13 +215,13 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a ; GFX10-LABEL: v_mul_i64_masked_src0_hi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v2, s[6:7] -; 
GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v4, v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 @@ -226,8 +234,10 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a ; GFX11-LABEL: v_mul_i64_masked_src0_hi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -259,13 +269,13 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_lo(ptr addrspace(1) %out, ptr a ; GFX10-LABEL: v_mul_i64_masked_src0_lo: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -276,8 +286,10 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_lo(ptr addrspace(1) %out, ptr a ; GFX11-LABEL: v_mul_i64_masked_src0_lo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -307,16 +319,16 @@ define amdgpu_kernel void @v_mul_i64_masked_src1_lo(ptr addrspace(1) %out, ptr a ; GFX10-LABEL: v_mul_i64_masked_src1_lo: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX10-NEXT: ; kill: killed $vgpr3 ; GFX10-NEXT: ; kill: killed $sgpr6_sgpr7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[1:2], v3, s[2:3] -; GFX10-NEXT: ; kill: killed $sgpr2_sgpr3 +; GFX10-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1] +; GFX10-NEXT: ; kill: killed $sgpr0_sgpr1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v1, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -326,8 +338,10 @@ define amdgpu_kernel void @v_mul_i64_masked_src1_lo(ptr addrspace(1) %out, ptr a ; GFX11-LABEL: v_mul_i64_masked_src1_lo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -355,7 +369,7 @@ define amdgpu_kernel void @v_mul_i64_masked_src1_lo(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_mul_i64_masked_src0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) { ; GFX10-LABEL: v_mul_i64_masked_src0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -365,7 +379,7 @@ define amdgpu_kernel void @v_mul_i64_masked_src0(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: v_mul_i64_masked_src0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -389,13 +403,13 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out ; GFX10-LABEL: v_mul_i64_partially_masked_src0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_and_b32_e32 v6, 0xfff00000, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -412,8 +426,10 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out ; GFX11-LABEL: v_mul_i64_partially_masked_src0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -450,7 +466,7 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out define amdgpu_kernel void 
@v_mul64_masked_before_branch(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) { ; GFX10-LABEL: v_mul64_masked_before_branch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -460,7 +476,7 @@ define amdgpu_kernel void @v_mul64_masked_before_branch(ptr addrspace(1) %out, p ; ; GFX11-LABEL: v_mul64_masked_before_branch: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -498,13 +514,13 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1) ; GFX10-LABEL: v_mul64_masked_before_and_in_branch: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v0, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[4:5], v0, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[4:5], v0, s[0:1] ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cmp_ge_u64_e32 vcc_lo, 0, v[2:3] @@ -533,8 +549,10 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1) ; GFX11-LABEL: v_mul64_masked_before_and_in_branch: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: 
v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 2d81452f9ef38..b0f3eee3c7363 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -2542,7 +2542,7 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: s_mul_u64_zext_with_sregs: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x50 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s3, s[2:3], 0x0 @@ -2559,7 +2559,7 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: s_mul_u64_zext_with_sregs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x50 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -2576,7 +2576,7 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: s_mul_u64_zext_with_sregs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0 @@ -2590,7 +2590,7 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: s_mul_u64_zext_with_sregs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword 
s3, s[2:3], 0x0 @@ -2604,7 +2604,7 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: s_mul_u64_zext_with_sregs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s3, s[2:3], 0x0 @@ -2619,7 +2619,7 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: s_mul_u64_zext_with_sregs: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -2718,7 +2718,7 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: s_mul_u64_sext_with_sregs: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x50 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s3, s[2:3], 0x0 @@ -2738,7 +2738,7 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: s_mul_u64_sext_with_sregs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x50 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -2758,7 +2758,7 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: s_mul_u64_sext_with_sregs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0 @@ -2775,7 +2775,7 @@ define 
amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: s_mul_u64_sext_with_sregs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -2792,7 +2792,7 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: s_mul_u64_sext_with_sregs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -2810,7 +2810,7 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: s_mul_u64_sext_with_sregs: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll index eaaeb3dc77a41..c7afbeabbbb6b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -13,33 +13,33 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(ptr addrspace(1) %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) { ; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[4:5], 0x8 -; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_load_dword s4, s[6:7], 0x8 +; GCN-NEXT: s_add_u32 s0, s0, s15 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b32 s33, 0 ; GCN-NEXT: s_movk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 
; GCN-NEXT: s_cbranch_scc1 .LBB0_3 ; GCN-NEXT: ; %bb.1: ; %bb.0 -; GCN-NEXT: s_load_dword s6, s[4:5], 0xc +; GCN-NEXT: s_load_dword s4, s[6:7], 0xc ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_3 ; GCN-NEXT: ; %bb.2: ; %bb.1 -; GCN-NEXT: s_load_dword s7, s[4:5], 0x10 -; GCN-NEXT: s_add_u32 s6, s32, 0x1000 +; GCN-NEXT: s_load_dword s5, s[6:7], 0x10 +; GCN-NEXT: s_add_u32 s4, s32, 0x1000 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: v_mov_b32_e32 v3, 1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s7, s7, 2 -; GCN-NEXT: s_add_u32 s6, s6, s7 +; GCN-NEXT: s_lshl_b32 s5, s5, 2 +; GCN-NEXT: s_add_u32 s4, s4, s5 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_u32_e32 v0, v2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -84,29 +84,29 @@ bb.2: define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) { ; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[4:5], 0x8 -; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_load_dword s4, s[6:7], 0x8 +; GCN-NEXT: s_add_u32 s0, s0, s15 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b32 s33, 0 ; GCN-NEXT: s_movk_i32 s32, 0x1000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB1_2 ; GCN-NEXT: ; %bb.1: ; %bb.0 -; GCN-NEXT: s_load_dword s6, s[4:5], 0xc -; GCN-NEXT: s_add_u32 s7, s32, 0x1000 -; GCN-NEXT: s_and_b32 s7, 
s7, 0xfffff000 +; GCN-NEXT: s_load_dword s4, s[6:7], 0xc +; GCN-NEXT: s_add_u32 s5, s32, 0x1000 +; GCN-NEXT: s_and_b32 s5, s5, 0xfffff000 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s6, s6, 2 +; GCN-NEXT: s_lshl_b32 s4, s4, 2 ; GCN-NEXT: v_mov_b32_e32 v3, 1 -; GCN-NEXT: s_add_u32 s6, s7, s6 +; GCN-NEXT: s_add_u32 s4, s5, s4 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_u32_e32 v0, v2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index b666f45521661..cf69c50ed9357 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -6,83 +6,83 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %x, i32 %y) { ; GFX8-LABEL: sdivrem_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_ashr_i32 s8, s7, 31 -; GFX8-NEXT: s_add_i32 s0, s7, s8 -; GFX8-NEXT: s_xor_b32 s7, s0, s8 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX8-NEXT: s_sub_i32 s0, 0, s7 +; GFX8-NEXT: s_ashr_i32 s8, s5, 31 +; GFX8-NEXT: s_add_i32 s0, s5, s8 +; GFX8-NEXT: s_xor_b32 s5, s0, s8 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX8-NEXT: s_sub_i32 s0, 0, s5 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 
0x0 -; GFX8-NEXT: s_ashr_i32 s4, s6, 31 -; GFX8-NEXT: s_add_i32 s5, s6, s4 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_ashr_i32 s6, s4, 31 +; GFX8-NEXT: s_add_i32 s4, s4, s6 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_xor_b32 s5, s5, s4 -; GFX8-NEXT: s_xor_b32 s6, s4, s8 +; GFX8-NEXT: s_xor_b32 s4, s4, s6 +; GFX8-NEXT: s_xor_b32 s7, s6, s8 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v2, s5, v0 +; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s5, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_xor_b32_e32 v2, s7, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s7, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v3 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sdivrem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s6, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s6 -; GFX9-NEXT: s_xor_b32 s7, s1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_sub_i32 s1, 0, s7 +; GFX9-NEXT: s_ashr_i32 s4, s1, 31 +; GFX9-NEXT: s_add_i32 s1, s1, s4 +; GFX9-NEXT: s_xor_b32 s5, s1, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: s_sub_i32 s1, 0, s5 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 +; GFX9-NEXT: s_xor_b32 s4, s8, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_xor_b32 s4, s8, s6 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 @@ -95,16 +95,17 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: sdivrem_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ashr_i32 s6, s1, 31 +; GFX10-NEXT: s_ashr_i32 s4, s1, 31 ; GFX10-NEXT: s_ashr_i32 s8, s0, 31 -; GFX10-NEXT: s_add_i32 s1, s1, s6 +; GFX10-NEXT: s_add_i32 s1, s1, s4 ; GFX10-NEXT: s_add_i32 s0, s0, s8 -; GFX10-NEXT: s_xor_b32 s7, s1, s6 +; GFX10-NEXT: s_xor_b32 s5, s1, s4 ; GFX10-NEXT: s_xor_b32 s0, s0, s8 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX10-NEXT: s_sub_i32 s1, 0, s7 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX10-NEXT: s_sub_i32 s1, 0, s5 +; GFX10-NEXT: s_xor_b32 s4, s8, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -112,18 +113,17 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s7 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_xor_b32 s4, s8, s6 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -145,7 +145,7 @@ define amdgpu_kernel void @sdivrem_i32(ptr 
addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) { ; GFX8-LABEL: sdivrem_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s2, s9, 31 ; GFX8-NEXT: s_ashr_i32 s12, s11, 31 @@ -305,7 +305,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: sdivrem_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s2, s9, 31 ; GFX9-NEXT: s_ashr_i32 s12, s11, 31 @@ -459,7 +459,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: sdivrem_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s2, s9, 31 ; GFX10-NEXT: s_ashr_i32 s12, s11, 31 @@ -616,7 +616,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) { ; GFX8-LABEL: sdivrem_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s2, s10, 31 ; GFX8-NEXT: s_add_i32 s0, s10, s2 @@ -692,7 +692,7 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: sdivrem_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s8, s6, 31 ; GFX9-NEXT: s_add_i32 s6, s6, s8 @@ -765,7 +765,7 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr 
addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: sdivrem_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s1, s10, 31 ; GFX10-NEXT: s_ashr_i32 s2, s11, 31 @@ -845,8 +845,8 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) { ; GFX8-LABEL: sdivrem_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s2, s12, 31 ; GFX8-NEXT: s_add_i32 s0, s12, s2 @@ -986,19 +986,19 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: sdivrem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s6, s12, 31 -; GFX9-NEXT: s_add_i32 s0, s12, s6 -; GFX9-NEXT: s_xor_b32 s7, s0, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_ashr_i32 s4, s13, 31 -; GFX9-NEXT: s_add_i32 s5, s13, s4 +; GFX9-NEXT: s_ashr_i32 s4, s12, 31 +; GFX9-NEXT: s_add_i32 s0, s12, s4 +; GFX9-NEXT: s_xor_b32 s5, s0, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_ashr_i32 s6, s13, 31 +; GFX9-NEXT: s_add_i32 s7, s13, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s5, s5, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GFX9-NEXT: s_sub_i32 s13, 0, s7 +; GFX9-NEXT: s_xor_b32 s7, s7, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX9-NEXT: s_sub_i32 s13, 0, s5 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; 
GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 @@ -1009,7 +1009,7 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: s_xor_b32 s8, s8, s12 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_sub_i32 s13, 0, s5 +; GFX9-NEXT: s_sub_i32 s13, 0, s7 ; GFX9-NEXT: v_mul_lo_u32 v3, s13, v1 ; GFX9-NEXT: s_ashr_i32 s13, s9, 31 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 @@ -1017,62 +1017,62 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3 ; GFX9-NEXT: s_add_i32 s9, s9, s13 ; GFX9-NEXT: s_xor_b32 s9, s9, s13 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, s7 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s5 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1 ; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, s7, v3 +; GFX9-NEXT: v_subrev_u32_e32 v2, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v2 -; GFX9-NEXT: s_xor_b32 s6, s12, s6 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v2 +; GFX9-NEXT: s_xor_b32 s4, s12, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s5 -; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 -; GFX9-NEXT: s_ashr_i32 s6, s14, 31 -; GFX9-NEXT: s_add_i32 s7, s14, s6 +; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s7 +; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 +; GFX9-NEXT: s_ashr_i32 s4, s14, 31 +; GFX9-NEXT: s_add_i32 s5, s14, s4 ; GFX9-NEXT: v_xor_b32_e32 v2, s12, v2 -; GFX9-NEXT: s_xor_b32 s7, 
s7, s6 +; GFX9-NEXT: s_xor_b32 s5, s5, s4 ; GFX9-NEXT: v_subrev_u32_e32 v4, s12, v2 ; GFX9-NEXT: v_sub_u32_e32 v2, s9, v3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s5 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX9-NEXT: v_subrev_u32_e32 v5, s5, v2 +; GFX9-NEXT: v_subrev_u32_e32 v5, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 -; GFX9-NEXT: s_sub_i32 s8, 0, s7 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: s_sub_i32 s8, 0, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_mul_lo_u32 v5, s8, v3 -; GFX9-NEXT: s_xor_b32 s4, s13, s4 -; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 -; GFX9-NEXT: s_ashr_i32 s4, s15, 31 -; GFX9-NEXT: s_add_i32 s9, s15, s4 +; GFX9-NEXT: s_xor_b32 s6, s13, s6 +; GFX9-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX9-NEXT: v_subrev_u32_e32 v1, s6, v1 +; GFX9-NEXT: s_ashr_i32 s6, s15, 31 +; GFX9-NEXT: s_add_i32 s9, s15, s6 ; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 -; GFX9-NEXT: s_xor_b32 s9, s9, s4 +; GFX9-NEXT: s_xor_b32 s9, s9, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s9 -; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v2 -; GFX9-NEXT: s_ashr_i32 s5, s10, 31 -; GFX9-NEXT: s_add_i32 s8, s10, s5 -; GFX9-NEXT: s_xor_b32 s8, s8, s5 +; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v2 +; GFX9-NEXT: s_ashr_i32 s7, s10, 31 +; GFX9-NEXT: s_add_i32 s8, s10, s7 +; GFX9-NEXT: s_xor_b32 s8, s8, s7 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 ; GFX9-NEXT: v_mul_hi_u32 v3, s8, v3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GFX9-NEXT: v_xor_b32_e32 v2, s13, v2 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, s7 +; GFX9-NEXT: v_mul_lo_u32 v6, v3, 
s5 ; GFX9-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GFX9-NEXT: v_subrev_u32_e32 v5, s13, v2 @@ -1080,27 +1080,27 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: s_sub_i32 s8, 0, s9 ; GFX9-NEXT: v_mul_lo_u32 v8, s8, v7 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v2 +; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GFX9-NEXT: v_mul_hi_u32 v8, v7, v8 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v2 -; GFX9-NEXT: s_ashr_i32 s7, s11, 31 -; GFX9-NEXT: s_add_i32 s8, s11, s7 -; GFX9-NEXT: s_xor_b32 s8, s8, s7 +; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v2 +; GFX9-NEXT: s_ashr_i32 s5, s11, 31 +; GFX9-NEXT: s_add_i32 s8, s11, s5 +; GFX9-NEXT: s_xor_b32 s8, s8, s5 ; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 ; GFX9-NEXT: v_mul_hi_u32 v7, s8, v7 -; GFX9-NEXT: s_xor_b32 s6, s5, s6 +; GFX9-NEXT: s_xor_b32 s4, s7, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v6, vcc -; GFX9-NEXT: v_xor_b32_e32 v2, s6, v3 +; GFX9-NEXT: v_xor_b32_e32 v2, s4, v3 ; GFX9-NEXT: v_mul_lo_u32 v3, v7, s9 ; GFX9-NEXT: v_add_u32_e32 v8, 1, v7 -; GFX9-NEXT: s_xor_b32 s4, s7, s4 -; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v2 +; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v2 +; GFX9-NEXT: s_xor_b32 s4, s5, s6 ; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc @@ -1112,12 +1112,12 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_subrev_u32_e32 v8, s9, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v8, vcc ; GFX9-NEXT: v_xor_b32_e32 v3, s4, v7 -; GFX9-NEXT: v_xor_b32_e32 v6, 
s5, v6 +; GFX9-NEXT: v_xor_b32_e32 v6, s7, v6 ; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_xor_b32_e32 v7, s7, v8 +; GFX9-NEXT: v_xor_b32_e32 v7, s5, v8 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v6 -; GFX9-NEXT: v_subrev_u32_e32 v7, s7, v7 +; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v6 +; GFX9-NEXT: v_subrev_u32_e32 v7, s5, v7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] @@ -1125,18 +1125,18 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: sdivrem_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s0, s12, 31 ; GFX10-NEXT: s_ashr_i32 s1, s13, 31 ; GFX10-NEXT: s_ashr_i32 s2, s14, 31 ; GFX10-NEXT: s_ashr_i32 s3, s15, 31 -; GFX10-NEXT: s_add_i32 s6, s12, s0 -; GFX10-NEXT: s_add_i32 s7, s13, s1 +; GFX10-NEXT: s_add_i32 s4, s12, s0 +; GFX10-NEXT: s_add_i32 s5, s13, s1 ; GFX10-NEXT: s_add_i32 s12, s14, s2 ; GFX10-NEXT: s_add_i32 s13, s15, s3 -; GFX10-NEXT: s_xor_b32 s14, s6, s0 -; GFX10-NEXT: s_xor_b32 s15, s7, s1 +; GFX10-NEXT: s_xor_b32 s14, s4, s0 +; GFX10-NEXT: s_xor_b32 s15, s5, s1 ; GFX10-NEXT: s_xor_b32 s12, s12, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s14 ; GFX10-NEXT: s_xor_b32 s13, s13, s3 @@ -1144,11 +1144,11 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s12 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s13 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_sub_i32 s6, 0, s14 +; GFX10-NEXT: s_sub_i32 s4, 0, s14 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX10-NEXT: s_sub_i32 s7, 0, s15 +; GFX10-NEXT: s_sub_i32 s5, 0, s15 ; GFX10-NEXT: s_sub_i32 s19, 0, s12 ; GFX10-NEXT: s_ashr_i32 s16, s8, 31 ; 
GFX10-NEXT: s_ashr_i32 s17, s9, 31 @@ -1163,22 +1163,22 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_lo_u32 v4, s6, v0 -; GFX10-NEXT: s_sub_i32 s6, 0, s13 -; GFX10-NEXT: v_mul_lo_u32 v5, s7, v1 +; GFX10-NEXT: v_mul_lo_u32 v4, s4, v0 +; GFX10-NEXT: s_sub_i32 s4, 0, s13 +; GFX10-NEXT: v_mul_lo_u32 v5, s5, v1 ; GFX10-NEXT: v_mul_lo_u32 v6, s19, v2 -; GFX10-NEXT: v_mul_lo_u32 v7, s6, v3 +; GFX10-NEXT: v_mul_lo_u32 v7, s4, v3 ; GFX10-NEXT: s_ashr_i32 s19, s11, 31 -; GFX10-NEXT: s_add_i32 s6, s8, s16 -; GFX10-NEXT: s_add_i32 s7, s9, s17 +; GFX10-NEXT: s_add_i32 s4, s8, s16 +; GFX10-NEXT: s_add_i32 s5, s9, s17 ; GFX10-NEXT: v_mul_hi_u32 v4, v0, v4 ; GFX10-NEXT: s_add_i32 s8, s10, s18 ; GFX10-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX10-NEXT: v_mul_hi_u32 v6, v2, v6 ; GFX10-NEXT: v_mul_hi_u32 v7, v3, v7 ; GFX10-NEXT: s_add_i32 s9, s11, s19 -; GFX10-NEXT: s_xor_b32 s10, s6, s16 -; GFX10-NEXT: s_xor_b32 s11, s7, s17 +; GFX10-NEXT: s_xor_b32 s10, s4, s16 +; GFX10-NEXT: s_xor_b32 s11, s5, s17 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v4 ; GFX10-NEXT: s_xor_b32 s8, s8, s18 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v5 @@ -1190,7 +1190,7 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_hi_u32 v2, s8, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, s9, v3 ; GFX10-NEXT: s_xor_b32 s22, s18, s2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX10-NEXT: v_mul_lo_u32 v4, v0, s14 ; GFX10-NEXT: v_mul_lo_u32 v5, v1, s15 ; GFX10-NEXT: v_mul_lo_u32 v6, v2, s12 @@ -1271,8 +1271,8 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) { ; GFX8-LABEL: sdivrem_v2i64: ; GFX8: ; %bb.0: -; 
GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s4, s13, 31 ; GFX8-NEXT: s_ashr_i32 s6, s1, 31 @@ -1582,8 +1582,8 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: sdivrem_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s13, 31 ; GFX9-NEXT: s_ashr_i32 s6, s1, 31 @@ -1885,8 +1885,8 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-LABEL: sdivrem_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s16, s1, 31 ; GFX10-NEXT: s_ashr_i32 s4, s13, 31 @@ -2187,25 +2187,25 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i8 %x, i8 %y) { ; GFX8-LABEL: sdiv_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s6, 0x80008 -; GFX8-NEXT: s_ashr_i32 s7, s0, 31 -; GFX8-NEXT: s_add_i32 s0, s0, s7 -; GFX8-NEXT: s_xor_b32 s8, s0, s7 +; GFX8-NEXT: s_bfe_i32 s0, s4, 0x80008 +; GFX8-NEXT: s_ashr_i32 s5, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s5 +; GFX8-NEXT: s_xor_b32 s8, s0, s5 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX8-NEXT: s_sub_i32 s0, 0, s8 +; GFX8-NEXT: 
s_sext_i32_i8 s4, s4 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX8-NEXT: s_sext_i32_i8 s4, s6 -; GFX8-NEXT: s_ashr_i32 s5, s4, 31 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_ashr_i32 s6, s4, 31 +; GFX8-NEXT: s_add_i32 s4, s4, s6 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_add_i32 s4, s4, s5 -; GFX8-NEXT: s_xor_b32 s4, s4, s5 -; GFX8-NEXT: s_xor_b32 s6, s5, s7 +; GFX8-NEXT: s_xor_b32 s4, s4, s6 +; GFX8-NEXT: s_xor_b32 s5, s6, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2222,52 +2222,52 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s5, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s5, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_byte v[0:1], v3 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s0, 0x80008 -; GFX9-NEXT: s_ashr_i32 s6, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s6 -; GFX9-NEXT: s_xor_b32 s7, s1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_sub_i32 s1, 0, s7 +; GFX9-NEXT: s_ashr_i32 s4, s1, 
31 +; GFX9-NEXT: s_add_i32 s1, s1, s4 +; GFX9-NEXT: s_xor_b32 s5, s1, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: s_sub_i32 s1, 0, s5 ; GFX9-NEXT: s_sext_i32_i8 s0, s0 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 +; GFX9-NEXT: s_xor_b32 s4, s8, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_xor_b32 s4, s8, s6 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 @@ -2280,18 +2280,19 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; ; GFX10-LABEL: sdiv_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s0, 0x80008 ; GFX10-NEXT: s_sext_i32_i8 s0, s0 -; GFX10-NEXT: s_ashr_i32 s6, s1, 31 +; GFX10-NEXT: s_ashr_i32 s4, s1, 31 ; GFX10-NEXT: s_ashr_i32 s8, s0, 31 -; GFX10-NEXT: 
s_add_i32 s1, s1, s6 +; GFX10-NEXT: s_add_i32 s1, s1, s4 ; GFX10-NEXT: s_add_i32 s0, s0, s8 -; GFX10-NEXT: s_xor_b32 s7, s1, s6 +; GFX10-NEXT: s_xor_b32 s5, s1, s4 ; GFX10-NEXT: s_xor_b32 s0, s0, s8 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX10-NEXT: s_sub_i32 s1, 0, s7 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX10-NEXT: s_sub_i32 s1, 0, s5 +; GFX10-NEXT: s_xor_b32 s4, s8, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2299,18 +2300,17 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s7 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_xor_b32 s4, s8, s6 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2332,14 +2332,14 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i8> %x, <2 x i8> %y) { ; GFX8-LABEL: sdivrem_v2i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x10 +; 
GFX8-NEXT: s_load_dword s2, s[6:7], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s2, 0x80010 ; GFX8-NEXT: s_ashr_i32 s3, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s3 ; GFX8-NEXT: s_xor_b32 s8, s0, s3 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX8-NEXT: s_sub_i32 s6, 0, s8 +; GFX8-NEXT: s_sub_i32 s4, 0, s8 ; GFX8-NEXT: s_bfe_i32 s1, s2, 0x80018 ; GFX8-NEXT: s_ashr_i32 s10, s1, 31 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -2351,10 +2351,10 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11 ; GFX8-NEXT: s_ashr_i32 s9, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s9 -; GFX8-NEXT: v_mul_lo_u32 v1, s6, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s4, v0 ; GFX8-NEXT: s_xor_b32 s0, s0, s9 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -2420,45 +2420,45 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: sdivrem_v2i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s6, 0x80010 -; GFX9-NEXT: s_ashr_i32 s7, s0, 31 -; GFX9-NEXT: s_add_i32 s0, s0, s7 -; GFX9-NEXT: s_xor_b32 s8, s0, s7 +; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80010 +; GFX9-NEXT: s_ashr_i32 s5, s0, 31 +; GFX9-NEXT: s_add_i32 s0, s0, s5 +; GFX9-NEXT: s_xor_b32 s8, s0, s5 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_bfe_i32 s5, s6, 0x80018 -; GFX9-NEXT: s_ashr_i32 s9, s5, 31 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_bfe_i32 s7, s4, 0x80018 +; GFX9-NEXT: s_ashr_i32 s9, s7, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_add_i32 s5, s5, s9 -; GFX9-NEXT: s_xor_b32 s5, s5, s9 -; GFX9-NEXT: 
v_cvt_f32_u32_e32 v1, s5 +; GFX9-NEXT: s_add_i32 s7, s7, s9 +; GFX9-NEXT: s_xor_b32 s7, s7, s9 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s10, 0, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_sext_i32_i8 s4, s6 +; GFX9-NEXT: s_sext_i32_i8 s6, s4 ; GFX9-NEXT: v_mul_lo_u32 v2, s10, v0 -; GFX9-NEXT: s_ashr_i32 s10, s4, 31 +; GFX9-NEXT: s_ashr_i32 s10, s6, 31 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_add_i32 s4, s4, s10 -; GFX9-NEXT: s_xor_b32 s4, s4, s10 -; GFX9-NEXT: s_sub_i32 s11, 0, s5 +; GFX9-NEXT: s_add_i32 s6, s6, s10 +; GFX9-NEXT: s_xor_b32 s6, s6, s10 +; GFX9-NEXT: s_sub_i32 s11, 0, s7 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, s11, v1 -; GFX9-NEXT: s_bfe_i32 s6, s6, 0x80008 -; GFX9-NEXT: s_ashr_i32 s11, s6, 31 +; GFX9-NEXT: s_bfe_i32 s4, s4, 0x80008 +; GFX9-NEXT: s_ashr_i32 s11, s4, 31 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, s8 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2 -; GFX9-NEXT: s_add_i32 s6, s6, s11 +; GFX9-NEXT: s_add_i32 s4, s4, s11 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 -; GFX9-NEXT: s_xor_b32 s4, s6, s11 +; GFX9-NEXT: s_xor_b32 s4, s4, s11 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s8, v3 @@ -2469,25 +2469,25 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s8, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s5 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s7 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: s_xor_b32 s6, 
s10, s7 -; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX9-NEXT: s_xor_b32 s5, s10, s5 +; GFX9-NEXT: v_xor_b32_e32 v0, s5, v0 ; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: s_xor_b32 s4, s11, s9 ; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 ; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s5, v0 ; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_subrev_u32_e32 v3, s11, v3 @@ -2505,7 +2505,7 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: sdivrem_v2i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s0, 0x80018 ; GFX10-NEXT: s_bfe_i32 s3, s0, 0x80010 @@ -2517,36 +2517,36 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: s_xor_b32 s3, s3, s8 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX10-NEXT: s_sub_i32 s6, 0, s1 +; GFX10-NEXT: s_sub_i32 s4, 0, s1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: 
v_mul_lo_u32 v2, s6, v0 -; GFX10-NEXT: s_sub_i32 s6, 0, s3 -; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 -; GFX10-NEXT: s_bfe_i32 s6, s0, 0x80008 +; GFX10-NEXT: v_mul_lo_u32 v2, s4, v0 +; GFX10-NEXT: s_sub_i32 s4, 0, s3 +; GFX10-NEXT: v_mul_lo_u32 v3, s4, v1 +; GFX10-NEXT: s_bfe_i32 s4, s0, 0x80008 ; GFX10-NEXT: s_sext_i32_i8 s0, s0 -; GFX10-NEXT: s_ashr_i32 s9, s6, 31 +; GFX10-NEXT: s_ashr_i32 s9, s4, 31 ; GFX10-NEXT: s_ashr_i32 s10, s0, 31 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: s_add_i32 s6, s6, s9 +; GFX10-NEXT: s_add_i32 s4, s4, s9 ; GFX10-NEXT: s_add_i32 s0, s0, s10 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_xor_b32 s6, s6, s9 +; GFX10-NEXT: s_xor_b32 s4, s4, s9 ; GFX10-NEXT: s_xor_b32 s0, s0, s10 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s3 ; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s6, v2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s4, v2 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s1, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v2 @@ -2596,25 +2596,25 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i16 %x, i16 %y) { ; GFX8-LABEL: sdiv_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s6, 0x100010 -; GFX8-NEXT: s_ashr_i32 s7, s0, 31 -; GFX8-NEXT: s_add_i32 s0, s0, s7 -; GFX8-NEXT: s_xor_b32 s8, s0, s7 +; GFX8-NEXT: s_bfe_i32 s0, s4, 0x100010 +; GFX8-NEXT: s_ashr_i32 s5, s0, 
31 +; GFX8-NEXT: s_add_i32 s0, s0, s5 +; GFX8-NEXT: s_xor_b32 s8, s0, s5 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX8-NEXT: s_sub_i32 s0, 0, s8 +; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX8-NEXT: s_sext_i32_i16 s4, s6 -; GFX8-NEXT: s_ashr_i32 s5, s4, 31 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_ashr_i32 s6, s4, 31 +; GFX8-NEXT: s_add_i32 s4, s4, s6 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_add_i32 s4, s4, s5 -; GFX8-NEXT: s_xor_b32 s4, s4, s5 -; GFX8-NEXT: s_xor_b32 s6, s5, s7 +; GFX8-NEXT: s_xor_b32 s4, s4, s6 +; GFX8-NEXT: s_xor_b32 s5, s6, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2631,52 +2631,52 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s5, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s5, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_short v[0:1], v3 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s0, 0x100010 -; GFX9-NEXT: s_ashr_i32 s6, s1, 31 -; 
GFX9-NEXT: s_add_i32 s1, s1, s6 -; GFX9-NEXT: s_xor_b32 s7, s1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_sub_i32 s1, 0, s7 +; GFX9-NEXT: s_ashr_i32 s4, s1, 31 +; GFX9-NEXT: s_add_i32 s1, s1, s4 +; GFX9-NEXT: s_xor_b32 s5, s1, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: s_sub_i32 s1, 0, s5 ; GFX9-NEXT: s_sext_i32_i16 s0, s0 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 +; GFX9-NEXT: s_xor_b32 s4, s8, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_xor_b32 s4, s8, s6 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 @@ -2689,18 +2689,19 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; ; GFX10-LABEL: sdiv_i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s0, 
0x100010 ; GFX10-NEXT: s_sext_i32_i16 s0, s0 -; GFX10-NEXT: s_ashr_i32 s6, s1, 31 +; GFX10-NEXT: s_ashr_i32 s4, s1, 31 ; GFX10-NEXT: s_ashr_i32 s8, s0, 31 -; GFX10-NEXT: s_add_i32 s1, s1, s6 +; GFX10-NEXT: s_add_i32 s1, s1, s4 ; GFX10-NEXT: s_add_i32 s0, s0, s8 -; GFX10-NEXT: s_xor_b32 s7, s1, s6 +; GFX10-NEXT: s_xor_b32 s5, s1, s4 ; GFX10-NEXT: s_xor_b32 s0, s0, s8 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX10-NEXT: s_sub_i32 s1, 0, s7 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX10-NEXT: s_sub_i32 s1, 0, s5 +; GFX10-NEXT: s_xor_b32 s4, s8, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2708,18 +2709,17 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s7 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_xor_b32 s4, s8, s6 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2741,14 +2741,14 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou define amdgpu_kernel void 
@sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %x, <2 x i16> %y) { ; GFX8-LABEL: sdivrem_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sext_i32_i16 s0, s3 ; GFX8-NEXT: s_ashr_i32 s8, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s8 ; GFX8-NEXT: s_xor_b32 s9, s0, s8 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX8-NEXT: s_sub_i32 s6, 0, s9 +; GFX8-NEXT: s_sub_i32 s4, 0, s9 ; GFX8-NEXT: s_bfe_i32 s1, s3, 0x100010 ; GFX8-NEXT: s_ashr_i32 s10, s1, 31 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -2760,10 +2760,10 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11 ; GFX8-NEXT: s_ashr_i32 s3, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s3 -; GFX8-NEXT: v_mul_lo_u32 v1, s6, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s4, v0 ; GFX8-NEXT: s_xor_b32 s0, s0, s3 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -2829,15 +2829,15 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: sdivrem_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s0, s7 +; GFX9-NEXT: s_sext_i32_i16 s0, s5 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_bfe_i32 s5, s7, 0x100010 +; GFX9-NEXT: s_bfe_i32 s5, s5, 0x100010 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: s_ashr_i32 s7, s5, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s5, s5, s7 
@@ -2847,27 +2847,27 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s10, 0, s9 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_sext_i32_i16 s4, s6 +; GFX9-NEXT: s_sext_i32_i16 s6, s4 ; GFX9-NEXT: v_mul_lo_u32 v2, s10, v0 -; GFX9-NEXT: s_ashr_i32 s10, s4, 31 +; GFX9-NEXT: s_ashr_i32 s10, s6, 31 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_add_i32 s4, s4, s10 -; GFX9-NEXT: s_xor_b32 s4, s4, s10 +; GFX9-NEXT: s_add_i32 s6, s6, s10 +; GFX9-NEXT: s_xor_b32 s6, s6, s10 ; GFX9-NEXT: s_sub_i32 s11, 0, s5 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, s11, v1 -; GFX9-NEXT: s_bfe_i32 s6, s6, 0x100010 -; GFX9-NEXT: s_ashr_i32 s11, s6, 31 +; GFX9-NEXT: s_bfe_i32 s4, s4, 0x100010 +; GFX9-NEXT: s_ashr_i32 s11, s4, 31 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, s9 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2 -; GFX9-NEXT: s_add_i32 s6, s6, s11 +; GFX9-NEXT: s_add_i32 s4, s4, s11 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 -; GFX9-NEXT: s_xor_b32 s4, s6, s11 +; GFX9-NEXT: s_xor_b32 s4, s4, s11 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s9, v3 @@ -2912,7 +2912,7 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: sdivrem_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s2, s1 ; GFX10-NEXT: s_bfe_i32 s1, s1, 0x100010 @@ -2924,36 +2924,36 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: 
s_xor_b32 s1, s1, s8 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX10-NEXT: s_sub_i32 s6, 0, s2 +; GFX10-NEXT: s_sub_i32 s4, 0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0 -; GFX10-NEXT: s_sub_i32 s6, 0, s1 -; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 -; GFX10-NEXT: s_sext_i32_i16 s6, s0 +; GFX10-NEXT: v_mul_lo_u32 v2, s4, v0 +; GFX10-NEXT: s_sub_i32 s4, 0, s1 +; GFX10-NEXT: v_mul_lo_u32 v3, s4, v1 +; GFX10-NEXT: s_sext_i32_i16 s4, s0 ; GFX10-NEXT: s_bfe_i32 s0, s0, 0x100010 -; GFX10-NEXT: s_ashr_i32 s9, s6, 31 +; GFX10-NEXT: s_ashr_i32 s9, s4, 31 ; GFX10-NEXT: s_ashr_i32 s10, s0, 31 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: s_add_i32 s6, s6, s9 +; GFX10-NEXT: s_add_i32 s4, s4, s9 ; GFX10-NEXT: s_add_i32 s0, s0, s10 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_xor_b32 s6, s6, s9 +; GFX10-NEXT: s_xor_b32 s4, s4, s9 ; GFX10-NEXT: s_xor_b32 s0, s0, s10 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s6, v2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s4, v2 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 @@ -3002,25 +3002,25 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_i3(ptr 
addrspace(1) %out0, ptr addrspace(1) %out1, i3 %x, i3 %y) { ; GFX8-LABEL: sdivrem_i3: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s6, 0x30008 -; GFX8-NEXT: s_ashr_i32 s7, s0, 31 -; GFX8-NEXT: s_add_i32 s0, s0, s7 -; GFX8-NEXT: s_xor_b32 s8, s0, s7 +; GFX8-NEXT: s_bfe_i32 s0, s4, 0x30008 +; GFX8-NEXT: s_ashr_i32 s5, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s5 +; GFX8-NEXT: s_xor_b32 s8, s0, s5 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX8-NEXT: s_sub_i32 s0, 0, s8 +; GFX8-NEXT: s_bfe_i32 s4, s4, 0x30000 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX8-NEXT: s_bfe_i32 s4, s6, 0x30000 -; GFX8-NEXT: s_ashr_i32 s5, s4, 31 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_ashr_i32 s6, s4, 31 +; GFX8-NEXT: s_add_i32 s4, s4, s6 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_add_i32 s4, s4, s5 -; GFX8-NEXT: s_xor_b32 s4, s4, s5 -; GFX8-NEXT: s_xor_b32 s6, s5, s7 +; GFX8-NEXT: s_xor_b32 s4, s4, s6 +; GFX8-NEXT: s_xor_b32 s5, s6, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3037,12 +3037,12 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s5, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s5, v3 +; GFX8-NEXT: 
v_subrev_u32_e32 v3, vcc, s6, v3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v3 @@ -3052,39 +3052,39 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; ; GFX9-LABEL: sdivrem_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s0, 0x30008 -; GFX9-NEXT: s_ashr_i32 s6, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s6 -; GFX9-NEXT: s_xor_b32 s7, s1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_sub_i32 s1, 0, s7 +; GFX9-NEXT: s_ashr_i32 s4, s1, 31 +; GFX9-NEXT: s_add_i32 s1, s1, s4 +; GFX9-NEXT: s_xor_b32 s5, s1, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: s_sub_i32 s1, 0, s5 ; GFX9-NEXT: s_bfe_i32 s0, s0, 0x30000 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 +; GFX9-NEXT: s_xor_b32 s4, s8, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_xor_b32 s4, s8, s6 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 
v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 @@ -3099,18 +3099,19 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; ; GFX10-LABEL: sdivrem_i3: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s0, 0x30008 ; GFX10-NEXT: s_bfe_i32 s0, s0, 0x30000 -; GFX10-NEXT: s_ashr_i32 s6, s1, 31 -; GFX10-NEXT: s_ashr_i32 s7, s0, 31 -; GFX10-NEXT: s_add_i32 s1, s1, s6 -; GFX10-NEXT: s_add_i32 s0, s0, s7 -; GFX10-NEXT: s_xor_b32 s1, s1, s6 -; GFX10-NEXT: s_xor_b32 s0, s0, s7 +; GFX10-NEXT: s_ashr_i32 s4, s1, 31 +; GFX10-NEXT: s_ashr_i32 s5, s0, 31 +; GFX10-NEXT: s_add_i32 s1, s1, s4 +; GFX10-NEXT: s_add_i32 s0, s0, s5 +; GFX10-NEXT: s_xor_b32 s1, s1, s4 +; GFX10-NEXT: s_xor_b32 s0, s0, s5 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s1 ; GFX10-NEXT: s_sub_i32 s2, 0, s1 +; GFX10-NEXT: s_xor_b32 s4, s5, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -3128,15 +3129,14 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s1, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_xor_b32 s4, s7, s6 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s7, v1 +; GFX10-NEXT: v_xor_b32_e32 v1, s5, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s4, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s7, v1 +; GFX10-NEXT: 
v_subrev_nc_u32_e32 v1, s5, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 7, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3153,25 +3153,25 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i27 %x, i27 %y) { ; GFX8-LABEL: sdivrem_i27: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s7, 0x1b0000 -; GFX8-NEXT: s_ashr_i32 s7, s0, 31 -; GFX8-NEXT: s_add_i32 s0, s0, s7 -; GFX8-NEXT: s_xor_b32 s8, s0, s7 +; GFX8-NEXT: s_bfe_i32 s0, s5, 0x1b0000 +; GFX8-NEXT: s_ashr_i32 s5, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s5 +; GFX8-NEXT: s_xor_b32 s8, s0, s5 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX8-NEXT: s_sub_i32 s0, 0, s8 +; GFX8-NEXT: s_bfe_i32 s4, s4, 0x1b0000 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX8-NEXT: s_bfe_i32 s4, s6, 0x1b0000 -; GFX8-NEXT: s_ashr_i32 s5, s4, 31 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_ashr_i32 s6, s4, 31 +; GFX8-NEXT: s_add_i32 s4, s4, s6 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_add_i32 s4, s4, s5 -; GFX8-NEXT: s_xor_b32 s4, s4, s5 -; GFX8-NEXT: s_xor_b32 s6, s5, s7 +; GFX8-NEXT: s_xor_b32 s4, s4, s6 +; GFX8-NEXT: s_xor_b32 s5, s6, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3188,12 +3188,12 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 ; 
GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s5, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s5, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v3 @@ -3203,39 +3203,39 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: sdivrem_i27: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s1, 0x1b0000 -; GFX9-NEXT: s_ashr_i32 s6, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s6 -; GFX9-NEXT: s_xor_b32 s7, s1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_sub_i32 s1, 0, s7 +; GFX9-NEXT: s_ashr_i32 s4, s1, 31 +; GFX9-NEXT: s_add_i32 s1, s1, s4 +; GFX9-NEXT: s_xor_b32 s5, s1, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: s_sub_i32 s1, 0, s5 ; GFX9-NEXT: s_bfe_i32 s0, s0, 0x1b0000 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 +; GFX9-NEXT: s_xor_b32 s4, s8, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_xor_b32 s4, s8, s6 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 
vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 @@ -3250,18 +3250,19 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: sdivrem_i27: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s1, 0x1b0000 ; GFX10-NEXT: s_bfe_i32 s0, s0, 0x1b0000 -; GFX10-NEXT: s_ashr_i32 s6, s1, 31 -; GFX10-NEXT: s_ashr_i32 s7, s0, 31 -; GFX10-NEXT: s_add_i32 s1, s1, s6 -; GFX10-NEXT: s_add_i32 s0, s0, s7 -; GFX10-NEXT: s_xor_b32 s1, s1, s6 -; GFX10-NEXT: s_xor_b32 s0, s0, s7 +; GFX10-NEXT: s_ashr_i32 s4, s1, 31 +; GFX10-NEXT: s_ashr_i32 s5, s0, 31 +; GFX10-NEXT: s_add_i32 s1, s1, s4 +; GFX10-NEXT: s_add_i32 s0, s0, s5 +; GFX10-NEXT: s_xor_b32 s1, s1, s4 +; GFX10-NEXT: s_xor_b32 s0, s0, s5 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s1 ; GFX10-NEXT: s_sub_i32 s2, 0, s1 +; GFX10-NEXT: s_xor_b32 s4, s5, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -3279,15 +3280,14 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s1, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_xor_b32 s4, s7, s6 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: 
v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s7, v1 +; GFX10-NEXT: v_xor_b32_e32 v1, s5, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s4, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s7, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s5, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0x7ffffff, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll index 7ad19a4797003..3729f1cc2b12d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -238,7 +238,7 @@ define i64 @v_shl_i64_sext_i32_overflow(i32 %x) { define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; GFX7-LABEL: mulu24_shl64: ; GFX7: ; %bb.0: ; %bb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: v_and_b32_e32 v0, 6, v0 ; GFX7-NEXT: v_mul_u32_u24_e32 v0, 7, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 @@ -251,7 +251,7 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; ; GFX8-LABEL: mulu24_shl64: ; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX8-NEXT: v_and_b32_e32 v0, 6, v0 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 7, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -266,7 +266,7 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: mulu24_shl64: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: v_and_b32_e32 v0, 6, v0 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 7, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -281,7 +281,7 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture 
%arg) { ; ; GFX10-LABEL: mulu24_shl64: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v0, 6, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mul_u32_u24_e32 v0, 7, v0 @@ -296,7 +296,7 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; ; GFX11-LABEL: mulu24_shl64: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 6, v0 ; GFX11-NEXT: v_mul_u32_u24_e32 v0, 7, v0 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1] @@ -321,7 +321,7 @@ bb: define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr addrspace(1) nocapture readonly %arg1) { ; GFX7-LABEL: muli24_shl64: ; GFX7: ; %bb.0: ; %bb -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -340,7 +340,7 @@ define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr add ; ; GFX8-LABEL: muli24_shl64: ; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -363,7 +363,7 @@ define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr add ; ; GFX9-LABEL: muli24_shl64: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -378,7 +378,7 @@ define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr add ; ; GFX10-LABEL: muli24_shl64: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -393,16 +393,17 @@ define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr add ; ; GFX11-LABEL: muli24_shl64: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v1, 0xff800000, v1 -; GFX11-NEXT: v_mul_i32_i24_e32 v1, -7, v1 -; GFX11-NEXT: v_lshlrev_b64 v[1:2], 3, v[1:2] -; GFX11-NEXT: global_store_b64 v0, v[1:2], s[0:1] +; GFX11-NEXT: v_or_b32_e32 v0, 0xff800000, v0 +; GFX11-NEXT: v_mul_i32_i24_e32 v0, -7, v0 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll index 99aaec458c33e..2d85081f5fc96 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -10,21 +10,21 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -38,22 +38,22 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) ; GFX10-LABEL: store_lds_v4i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: ds_write_b128 v4, v[0:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v4i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -67,21 +67,21 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX9-NEXT: 
s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, 0xffff, s4 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_lshr_b32 s1, s1, 8 -; GFX9-NEXT: s_lshr_b32 s0, s4, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s2, 8 +; GFX9-NEXT: s_lshr_b32 s1, s4, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 -; GFX9-NEXT: s_lshr_b32 s1, s0, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 +; GFX9-NEXT: s_lshr_b32 s0, s1, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s5 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:3 ; GFX9-NEXT: s_lshr_b32 s1, s1, 8 @@ -123,8 +123,8 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; ; GFX7-LABEL: store_lds_v4i32_align1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s2, s4, 0x80008 @@ -177,29 +177,29 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s0, s4, 16 -; GFX10-NEXT: s_and_b32 s1, 0xffff, s4 +; GFX10-NEXT: s_lshr_b32 s1, s4, 16 +; GFX10-NEXT: s_and_b32 
s2, 0xffff, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: s_lshr_b32 s2, s5, 16 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: s_lshr_b32 s0, s5, 16 ; GFX10-NEXT: s_and_b32 s3, 0xffff, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: s_lshr_b32 s1, s1, 8 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: s_lshr_b32 s2, s2, 8 +; GFX10-NEXT: v_mov_b32_e32 v4, s1 ; GFX10-NEXT: s_lshr_b32 s4, s6, 16 ; GFX10-NEXT: s_and_b32 s5, 0xffff, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: s_lshr_b32 s6, s0, 8 -; GFX10-NEXT: s_lshr_b32 s0, s3, 8 -; GFX10-NEXT: s_lshr_b32 s3, s2, 8 -; GFX10-NEXT: v_mov_b32_e32 v5, s2 -; GFX10-NEXT: v_mov_b32_e32 v6, s1 -; GFX10-NEXT: s_lshr_b32 s2, s5, 8 +; GFX10-NEXT: s_lshr_b32 s6, s1, 8 +; GFX10-NEXT: s_lshr_b32 s1, s3, 8 +; GFX10-NEXT: s_lshr_b32 s3, s0, 8 +; GFX10-NEXT: v_mov_b32_e32 v5, s0 +; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: s_lshr_b32 s0, s5, 8 ; GFX10-NEXT: v_mov_b32_e32 v7, s6 -; GFX10-NEXT: v_mov_b32_e32 v8, s0 +; GFX10-NEXT: v_mov_b32_e32 v8, s1 ; GFX10-NEXT: v_mov_b32_e32 v9, s3 ; GFX10-NEXT: ds_write_b8 v1, v0 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:4 @@ -209,7 +209,7 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-NEXT: ds_write_b8 v1, v7 offset:3 ; GFX10-NEXT: ds_write_b8 v1, v8 offset:5 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v10, s2 +; GFX10-NEXT: v_mov_b32_e32 v10, s0 ; GFX10-NEXT: s_lshr_b32 s0, s4, 8 ; GFX10-NEXT: ds_write_b8 v1, v9 offset:7 ; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 @@ -234,8 +234,8 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: 
s_and_b32 s2, 0xffff, s4 ; GFX11-NEXT: s_lshr_b32 s1, s4, 16 @@ -289,14 +289,14 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s4, 16 +; GFX9-NEXT: s_lshr_b32 s1, s4, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:2 ; GFX9-NEXT: s_lshr_b32 s0, s5, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s5 @@ -317,8 +317,8 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; ; GFX7-LABEL: store_lds_v4i32_align2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshr_b32 s1, s4, 16 @@ -347,20 +347,20 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: s_lshr_b32 s0, s4, 16 +; GFX10-NEXT: s_lshr_b32 s1, s4, 16 ; 
GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: s_lshr_b32 s1, s5, 16 +; GFX10-NEXT: s_lshr_b32 s0, s5, 16 ; GFX10-NEXT: s_lshr_b32 s2, s6, 16 ; GFX10-NEXT: s_lshr_b32 s3, s7, 16 ; GFX10-NEXT: v_mov_b32_e32 v4, s7 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 -; GFX10-NEXT: v_mov_b32_e32 v6, s1 +; GFX10-NEXT: v_mov_b32_e32 v5, s1 +; GFX10-NEXT: v_mov_b32_e32 v6, s0 ; GFX10-NEXT: v_mov_b32_e32 v7, s2 ; GFX10-NEXT: v_mov_b32_e32 v8, s3 ; GFX10-NEXT: ds_write_b16 v1, v0 @@ -376,8 +376,8 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s1, s4, 16 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 @@ -404,11 +404,11 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 @@ -418,8 +418,8 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 ; ; GFX7-LABEL: store_lds_v4i32_align4: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; 
GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -434,11 +434,11 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-NEXT: v_mov_b32_e32 v4, s7 @@ -449,8 +449,8 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6 @@ -465,21 +465,21 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX9-NEXT: s_endpgm ; ; 
GFX7-LABEL: store_lds_v4i32_align8: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -493,11 +493,11 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-NEXT: v_mov_b32_e32 v4, s7 @@ -508,8 +508,8 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -523,21 +523,21 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: 
v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -551,22 +551,22 @@ define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i ; GFX10-LABEL: store_lds_v4i32_align16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: ds_write_b128 v4, v[0:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v4i32_align16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll index 0f9ec965f2f0f..4ef79b752c437 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll @@ -10,20 +10,20 @@ define amdgpu_kernel 
void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -36,21 +36,21 @@ define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) ; GFX10-LABEL: store_lds_v3i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s0 ; GFX10-NEXT: ds_write_b96 v3, v[0:2] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0 @@ -63,21 +63,21 @@ define amdgpu_kernel void 
@store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, 0xffff, s4 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_lshr_b32 s1, s1, 8 -; GFX9-NEXT: s_lshr_b32 s0, s4, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s2, 8 +; GFX9-NEXT: s_lshr_b32 s1, s4, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 -; GFX9-NEXT: s_lshr_b32 s1, s0, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 +; GFX9-NEXT: s_lshr_b32 s0, s1, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s5 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:3 ; GFX9-NEXT: s_lshr_b32 s1, s1, 8 @@ -107,8 +107,8 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; ; GFX7-LABEL: store_lds_v3i32_align1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s2, s4, 0x80008 @@ -150,32 +150,32 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; 
GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s0, s4, 16 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: s_lshr_b32 s2, s5, 16 +; GFX10-NEXT: s_lshr_b32 s1, s4, 16 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: s_lshr_b32 s0, s5, 16 ; GFX10-NEXT: s_and_b32 s3, 0xffff, s5 -; GFX10-NEXT: s_and_b32 s1, 0xffff, s4 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: s_lshr_b32 s4, s6, 16 ; GFX10-NEXT: s_and_b32 s5, 0xffff, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: s_lshr_b32 s6, s0, 8 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 -; GFX10-NEXT: s_lshr_b32 s0, s3, 8 -; GFX10-NEXT: s_lshr_b32 s3, s2, 8 -; GFX10-NEXT: s_lshr_b32 s1, s1, 8 -; GFX10-NEXT: v_mov_b32_e32 v5, s2 -; GFX10-NEXT: s_lshr_b32 s2, s5, 8 +; GFX10-NEXT: s_lshr_b32 s6, s1, 8 +; GFX10-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-NEXT: s_lshr_b32 s1, s3, 8 +; GFX10-NEXT: s_lshr_b32 s3, s0, 8 +; GFX10-NEXT: s_lshr_b32 s2, s2, 8 +; GFX10-NEXT: v_mov_b32_e32 v5, s0 +; GFX10-NEXT: s_lshr_b32 s0, s5, 8 ; GFX10-NEXT: v_mov_b32_e32 v9, s3 -; GFX10-NEXT: v_mov_b32_e32 v6, s1 -; GFX10-NEXT: v_mov_b32_e32 v8, s0 -; GFX10-NEXT: v_mov_b32_e32 v10, s2 +; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: v_mov_b32_e32 v10, s0 ; GFX10-NEXT: s_lshr_b32 s0, s4, 8 ; GFX10-NEXT: v_mov_b32_e32 v7, s6 +; GFX10-NEXT: v_mov_b32_e32 v8, s1 ; GFX10-NEXT: ds_write_b8 v1, v0 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:4 ; GFX10-NEXT: ds_write_b8 v1, v4 offset:2 @@ -195,8 +195,8 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-NEXT: s_and_b32 s2, 0xffff, s4 ; GFX11-NEXT: s_lshr_b32 s1, s4, 16 @@ -237,14 +237,14 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s4, 16 +; GFX9-NEXT: s_lshr_b32 s1, s4, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:2 ; GFX9-NEXT: s_lshr_b32 s0, s5, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s5 @@ -260,8 +260,8 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 ; ; GFX7-LABEL: store_lds_v3i32_align2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshr_b32 s1, s4, 16 @@ -285,18 +285,18 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: s_lshr_b32 s0, s4, 16 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: s_lshr_b32 s1, s4, 16 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 
-; GFX10-NEXT: s_lshr_b32 s1, s5, 16 +; GFX10-NEXT: s_lshr_b32 s0, s5, 16 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-NEXT: s_lshr_b32 s2, s6, 16 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 -; GFX10-NEXT: v_mov_b32_e32 v5, s1 +; GFX10-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 ; GFX10-NEXT: ds_write_b16 v1, v0 ; GFX10-NEXT: ds_write_b16 v1, v2 offset:4 @@ -309,8 +309,8 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s1, s4, 16 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 @@ -333,11 +333,11 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 @@ -346,8 +346,8 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 ; ; GFX7-LABEL: store_lds_v3i32_align4: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -361,11 +361,11 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 @@ -375,8 +375,8 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6 @@ -390,11 +390,11 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 @@ -403,8 +403,8 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 ; ; GFX7-LABEL: 
store_lds_v3i32_align8: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -418,11 +418,11 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 @@ -432,8 +432,8 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6 @@ -447,20 +447,20 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align16(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: 
v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -473,21 +473,21 @@ define amdgpu_kernel void @store_lds_v3i32_align16(ptr addrspace(3) %out, <3 x i ; GFX10-LABEL: store_lds_v3i32_align16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s0 ; GFX10-NEXT: ds_write_b96 v3, v[0:2] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32_align16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index a58397eccaba7..8b94f93e44e56 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -6,32 +6,32 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %x, i32 %y) { ; GFX8-LABEL: 
udivrem_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX8-NEXT: s_sub_i32 s0, 0, s7 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX8-NEXT: s_sub_i32 s0, 0, s5 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v2, s6, v0 +; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s6, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc @@ -41,30 +41,30 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: udivrem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_sub_i32 s0, 0, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: s_sub_i32 s0, 0, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 +; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s6, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -73,28 +73,28 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: udivrem_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX10-NEXT: s_sub_i32 s0, 0, s7 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX10-NEXT: s_sub_i32 s0, 0, s5 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; 
GFX10-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 -; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s7 +; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, s6, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v1, s4, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo @@ -112,7 +112,7 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) { ; GFX8-LABEL: udivrem_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s11 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s10 @@ -251,7 +251,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: udivrem_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s11 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s10 @@ -384,7 +384,7 @@ define 
amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: udivrem_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s11 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s10 @@ -522,7 +522,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) { ; GFX8-LABEL: udivrem_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s11 @@ -576,7 +576,7 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11 @@ -627,7 +627,7 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: udivrem_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s10 @@ -685,8 +685,8 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) { ; GFX8-LABEL: udivrem_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 
0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s13 @@ -783,7 +783,7 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 @@ -792,6 +792,7 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s14 +; GFX9-NEXT: s_sub_i32 s4, 0, s14 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -799,8 +800,7 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GFX9-NEXT: v_mul_lo_u32 v2, s0, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s1, v1 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_sub_i32 s4, 0, s14 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 @@ -878,9 +878,9 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: udivrem_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s13 @@ -979,8 +979,8 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr 
addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) { ; GFX8-LABEL: udivrem_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20 -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[12:15], s[6:7], 0x20 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s13 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s12 @@ -1248,7 +1248,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[6:7], 0x20 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s13 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s12 @@ -1257,7 +1257,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 @@ -1510,7 +1510,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: udivrem_v2i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20 +; GFX10-NEXT: s_load_dwordx4 s[12:15], s[6:7], 0x20 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s13 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s15 @@ -1546,9 +1546,9 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s3, s2, v10, v[3:4] ; GFX10-NEXT: v_mul_lo_u32 v6, v9, v0 ; GFX10-NEXT: s_subb_u32 s3, 0, s15 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s6, s1, v7, v[4:5] +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, s1, v7, v[4:5] ; GFX10-NEXT: v_mul_hi_u32 v4, v7, v0 -; 
GFX10-NEXT: v_mad_u64_u32 v[0:1], s6, s3, v8, v[5:6] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, s3, v8, v[5:6] ; GFX10-NEXT: v_mul_lo_u32 v1, v10, v2 ; GFX10-NEXT: v_mul_hi_u32 v5, v8, v2 ; GFX10-NEXT: v_mul_hi_u32 v2, v10, v2 @@ -1560,39 +1560,39 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_hi_u32 v17, v8, v0 ; GFX10-NEXT: v_mul_hi_u32 v3, v9, v3 ; GFX10-NEXT: v_mul_hi_u32 v0, v10, v0 -; GFX10-NEXT: v_add_co_u32 v6, s6, v6, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s6 -; GFX10-NEXT: v_add_co_u32 v11, s6, v13, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s6 -; GFX10-NEXT: v_add_co_u32 v1, s6, v1, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s6 -; GFX10-NEXT: v_add_co_u32 v2, s6, v16, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s6 -; GFX10-NEXT: v_add_co_u32 v4, s6, v6, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6 -; GFX10-NEXT: v_add_co_u32 v6, s6, v11, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s6 -; GFX10-NEXT: v_add_co_u32 v1, s6, v1, v5 +; GFX10-NEXT: v_add_co_u32 v6, s4, v6, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s4 +; GFX10-NEXT: v_add_co_u32 v11, s4, v13, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s4 +; GFX10-NEXT: v_add_co_u32 v1, s4, v1, v15 +; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s4 +; GFX10-NEXT: v_add_co_u32 v2, s4, v16, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4 +; GFX10-NEXT: v_add_co_u32 v4, s4, v6, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 +; GFX10-NEXT: v_add_co_u32 v6, s4, v11, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4 +; GFX10-NEXT: v_add_co_u32 v1, s4, v1, v5 ; GFX10-NEXT: v_add_nc_u32_e32 v4, v12, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s6 -; GFX10-NEXT: v_add_co_u32 v2, s6, v2, v17 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s6 -; GFX10-NEXT: v_add_co_u32 v4, s6, v6, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 +; GFX10-NEXT: v_add_co_u32 v2, s4, v2, v17 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 +; GFX10-NEXT: 
v_add_co_u32 v4, s4, v6, v4 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v15, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v11, v13, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s4 ; GFX10-NEXT: v_add_nc_u32_e32 v5, v16, v5 ; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v7, v4 -; GFX10-NEXT: v_add_co_u32 v1, s6, v2, v1 +; GFX10-NEXT: v_add_co_u32 v1, s4, v2, v1 ; GFX10-NEXT: v_add3_u32 v3, v11, v6, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v3, vcc_lo ; GFX10-NEXT: v_add3_u32 v2, v5, v2, v0 ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v8, v1 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s6, s0, v7, 0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, s0, v7, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v10, v2, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s6, s2, v8, 0 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, s2, v8, 0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v11, v9, v0 ; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s0, v9, v[1:2] ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s2, v10, v[3:4] @@ -1772,34 +1772,34 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i8 %x, i8 %y) { ; GFX8-LABEL: udiv_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s7, s6, 0x80008 -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s7 +; GFX8-NEXT: s_bfe_u32 s5, s4, 0x80008 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_sub_i32 s0, 0, s7 +; GFX8-NEXT: s_sub_i32 s0, 0, s5 +; GFX8-NEXT: s_and_b32 s4, s4, 0xff ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX8-NEXT: s_and_b32 s4, s6, 0xff +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc @@ -1809,32 +1809,32 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; ; GFX9-LABEL: udiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x80008 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s6 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s1, 0, s6 -; GFX9-NEXT: s_and_b32 s7, s0, 0xff +; GFX9-NEXT: s_sub_i32 s1, 0, s4 +; GFX9-NEXT: s_and_b32 s5, s0, 0xff ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; 
GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s6 +; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s4 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_byte v2, v0, s[0:1] @@ -1843,12 +1843,12 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; ; GFX10-LABEL: udiv_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s4, s0, 0x80008 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s6 -; GFX10-NEXT: s_sub_i32 s1, 0, s6 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX10-NEXT: s_sub_i32 s1, 0, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -1856,17 +1856,17 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s6 +; GFX10-NEXT: v_mul_lo_u32 
v1, v0, s4 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo @@ -1884,8 +1884,8 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i8> %x, <2 x i8> %y) { ; GFX8-LABEL: udivrem_v2i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s2, s0, 0x80010 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 @@ -1949,55 +1949,55 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: udivrem_v2i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x80010 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x80010 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v0, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s1, 0, 
s6 +; GFX9-NEXT: s_sub_i32 s1, 0, s4 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_lshr_b32 s7, s0, 24 +; GFX9-NEXT: s_lshr_b32 s5, s0, 24 ; GFX9-NEXT: v_mul_lo_u32 v3, s1, v1 -; GFX9-NEXT: s_sub_i32 s2, 0, s7 +; GFX9-NEXT: s_sub_i32 s2, 0, s5 ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 ; GFX9-NEXT: s_and_b32 s8, s0, 0xff ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NEXT: v_mul_hi_u32 v1, s8, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s6 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s4 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, s7 +; GFX9-NEXT: v_mul_lo_u32 v2, v0, s5 ; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 ; GFX9-NEXT: v_sub_u32_e32 v2, s9, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v2 +; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: 
v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v2 +; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -2012,7 +2012,7 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: udivrem_v2i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, s0 ; GFX10-NEXT: s_bfe_u32 s1, s0, 0x80010 @@ -2020,7 +2020,7 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s1 ; GFX10-NEXT: s_sub_i32 s3, 0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_sub_i32 s6, 0, s1 +; GFX10-NEXT: s_sub_i32 s4, 0, s1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 @@ -2029,8 +2029,8 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0 ; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: v_mul_lo_u32 v3, s4, v1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 @@ -2081,34 +2081,34 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i16 %x, i16 %y) { ; GFX8-LABEL: udiv_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: 
s_load_dword s6, s[4:5], 0x10 +; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s7, s6, 16 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX8-NEXT: s_sub_i32 s0, 0, s7 +; GFX8-NEXT: s_lshr_b32 s5, s4, 16 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX8-NEXT: s_sub_i32 s0, 0, s5 +; GFX8-NEXT: s_and_b32 s4, s4, 0xffff ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX8-NEXT: s_and_b32 s4, s6, 0xffff +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc @@ -2118,32 +2118,32 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; ; GFX9-LABEL: udiv_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: s_sub_i32 s1, 0, s6 -; GFX9-NEXT: s_and_b32 s7, s0, 0xffff +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_sub_i32 s1, 0, s4 +; GFX9-NEXT: s_and_b32 s5, s0, 0xffff ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s6 +; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s4 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v2, v0, s[0:1] @@ -2152,12 +2152,12 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; ; GFX10-LABEL: udiv_i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s6, s0, 16 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, 
s6 -; GFX10-NEXT: s_sub_i32 s1, 0, s6 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX10-NEXT: s_sub_i32 s1, 0, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2165,17 +2165,17 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s6 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s4 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo @@ -2193,8 +2193,8 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %x, <2 x i16> %y) { ; GFX8-LABEL: udivrem_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s2, s1, 0xffff ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -2258,7 +2258,7 @@ define amdgpu_kernel void 
@udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s3, s1, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 @@ -2266,7 +2266,7 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX9-NEXT: s_sub_i32 s1, 0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s6, 0, s2 +; GFX9-NEXT: s_sub_i32 s4, 0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2274,10 +2274,10 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_lo_u32 v2, s1, v0 ; GFX9-NEXT: s_lshr_b32 s1, s0, 16 -; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, s4, v1 ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -2319,14 +2319,14 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: udivrem_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s2, s1, 0xffff ; GFX10-NEXT: s_lshr_b32 s1, s1, 16 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1 ; GFX10-NEXT: s_sub_i32 s3, 0, s2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, 
v0 @@ -2387,34 +2387,34 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i3 %x, i3 %y) { ; GFX8-LABEL: udivrem_i3: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s7, s6, 0x30008 -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s7 +; GFX8-NEXT: s_bfe_u32 s5, s4, 0x30008 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_sub_i32 s0, 0, s7 +; GFX8-NEXT: s_sub_i32 s0, 0, s5 +; GFX8-NEXT: s_and_b32 s4, s4, 7 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX8-NEXT: s_and_b32 s4, s6, 7 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: flat_store_byte v[0:1], v2 @@ 
-2426,32 +2426,32 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; ; GFX9-LABEL: udivrem_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x30008 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s6 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x30008 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s1, 0, s6 -; GFX9-NEXT: s_and_b32 s7, s0, 7 +; GFX9-NEXT: s_sub_i32 s1, 0, s4 +; GFX9-NEXT: s_and_b32 s5, s0, 7 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s6 +; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s4 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2462,12 +2462,12 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; ; GFX10-LABEL: udivrem_i3: ; GFX10: ; %bb.0: -; 
GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s6, s0, 0x30008 +; GFX10-NEXT: s_bfe_u32 s4, s0, 0x30008 ; GFX10-NEXT: s_and_b32 s0, s0, 7 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s6 -; GFX10-NEXT: s_sub_i32 s1, 0, s6 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX10-NEXT: s_sub_i32 s1, 0, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2475,17 +2475,17 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s6 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s4 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2505,34 +2505,34 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i27 %x, i27 %y) { ; GFX8-LABEL: udivrem_i27: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[4:5], 
s[6:7], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s7, s7, 0x7ffffff -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX8-NEXT: s_sub_i32 s0, 0, s7 +; GFX8-NEXT: s_and_b32 s5, s5, 0x7ffffff +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX8-NEXT: s_sub_i32 s0, 0, s5 +; GFX8-NEXT: s_and_b32 s4, s4, 0x7ffffff ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX8-NEXT: s_and_b32 s4, s6, 0x7ffffff +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2544,32 +2544,32 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: udivrem_i27: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s6, s1, 0x7ffffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: s_sub_i32 s1, 0, s6 -; GFX9-NEXT: s_and_b32 s7, s0, 0x7ffffff +; GFX9-NEXT: s_and_b32 s4, s1, 0x7ffffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_sub_i32 s1, 0, s4 +; GFX9-NEXT: s_and_b32 s5, s0, 0x7ffffff ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s6 +; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s4 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2580,12 +2580,12 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: udivrem_i27: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s6, s1, 0x7ffffff +; GFX10-NEXT: s_and_b32 s4, s1, 0x7ffffff ; GFX10-NEXT: s_and_b32 s0, s0, 0x7ffffff -; 
GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX10-NEXT: s_sub_i32 s1, 0, s6 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX10-NEXT: s_sub_i32 s1, 0, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2593,17 +2593,17 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s6 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s4 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll index 83cb92210ec84..c9a9eb9d91724 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll @@ -4,8 +4,8 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v3i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX906-NEXT: v_mov_b32_e32 v3, 8 ; GFX906-NEXT: v_mov_b32_e32 v5, 16 @@ -18,7 +18,7 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX906-NEXT: v_lshlrev_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX906-NEXT: v_or3_b32 v4, v6, v7, v4 -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB0_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dword v0, v2, s[6:7] @@ -28,7 +28,7 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX906-NEXT: v_or3_b32 v4, v2, v3, v0 ; GFX906-NEXT: .LBB0_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v4 ; GFX906-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 @@ -38,8 +38,8 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX906-NEXT: v_mov_b32_e32 v1, 0 -; GFX906-NEXT: global_store_short v1, v0, s[2:3] -; GFX906-NEXT: global_store_byte_d16_hi v1, v0, s[2:3] offset:2 +; GFX906-NEXT: global_store_short v1, v0, s[0:1] +; GFX906-NEXT: global_store_byte_d16_hi v1, v0, s[0:1] offset:2 ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -61,21 +61,21 @@ bb.2: define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: 
v4i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dword v1, v2, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB1_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dword v1, v2, s[6:7] ; GFX906-NEXT: .LBB1_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dword v0, v1, s[2:3] +; GFX906-NEXT: global_store_dword v0, v1, s[0:1] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -97,30 +97,30 @@ bb.2: define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v5i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB2_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX906-NEXT: .LBB2_2: ; %bb.2 -; 
GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: v_mov_b32_e32 v4, 0 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX906-NEXT: global_store_byte v4, v1, s[2:3] -; GFX906-NEXT: global_store_byte v4, v0, s[2:3] offset:1 -; GFX906-NEXT: global_store_byte_d16_hi v4, v1, s[2:3] offset:2 -; GFX906-NEXT: global_store_byte v4, v3, s[2:3] offset:3 -; GFX906-NEXT: global_store_byte v4, v2, s[2:3] offset:4 +; GFX906-NEXT: global_store_byte v4, v1, s[0:1] +; GFX906-NEXT: global_store_byte v4, v0, s[0:1] offset:1 +; GFX906-NEXT: global_store_byte_d16_hi v4, v1, s[0:1] offset:2 +; GFX906-NEXT: global_store_byte v4, v3, s[0:1] offset:3 +; GFX906-NEXT: global_store_byte v4, v2, s[0:1] offset:4 ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -142,21 +142,21 @@ bb.2: define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v8i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB3_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7] ; GFX906-NEXT: .LBB3_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] +; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] ; GFX906-NEXT: s_endpgm 
entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -178,21 +178,21 @@ bb.2: define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v16i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB4_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[6:7] ; GFX906-NEXT: .LBB4_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] +; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -214,25 +214,25 @@ bb.2: define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v32i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v9, 5, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[4:5] ; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[4:5] offset:16 -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; 
GFX906-NEXT: s_cbranch_execz .LBB5_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[6:7] ; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[6:7] offset:16 ; GFX906-NEXT: .LBB5_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] +; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: global_store_dwordx4 v0, v[5:8], s[2:3] offset:16 +; GFX906-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:16 ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -254,24 +254,24 @@ bb.2: define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v256i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX906-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX906-NEXT: s_mov_b32 s10, -1 -; GFX906-NEXT: s_mov_b32 s11, 0xe00000 -; GFX906-NEXT: s_add_u32 s8, s8, s3 -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; GFX906-NEXT: s_addc_u32 s9, s9, 0 -; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX906-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[4:5] +; GFX906-NEXT: s_mov_b32 s14, -1 +; GFX906-NEXT: s_mov_b32 s15, 0xe00000 +; GFX906-NEXT: s_add_u32 s12, s12, s9 +; GFX906-NEXT: s_addc_u32 s13, s13, 0 +; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v5, off, s[8:11], 0 ; 4-byte Folded Spill +; 
GFX906-NEXT: buffer_store_dword v5, off, s[12:15], 0 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill ; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[4:5] offset:16 ; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[4:5] offset:32 @@ -288,16 +288,16 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[4:5] offset:208 ; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[4:5] offset:224 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] offset:240 -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB6_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v3, 
off, s[12:15], 0 offset:12 ; 4-byte Folded Spill ; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[6:7] offset:16 ; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[6:7] offset:32 ; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[6:7] offset:48 @@ -314,13 +314,13 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[6:7] offset:224 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] offset:240 ; GFX906-NEXT: .LBB6_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill ; GFX906-NEXT: v_mov_b32_e32 v0, v57 ; GFX906-NEXT: v_mov_b32_e32 v1, v58 ; GFX906-NEXT: v_mov_b32_e32 v2, v59 @@ -377,34 +377,34 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_mov_b32_e32 v11, v7 ; GFX906-NEXT: v_mov_b32_e32 v10, v6 ; GFX906-NEXT: v_mov_b32_e32 v9, v5 -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v8, off, s[8:11], 0 
offset:12 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GFX906-NEXT: v_mov_b32_e32 v4, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[2:3] -; GFX906-NEXT: global_store_dwordx4 v4, v[9:12], s[2:3] offset:16 -; GFX906-NEXT: global_store_dwordx4 v4, v[13:16], s[2:3] offset:32 -; GFX906-NEXT: global_store_dwordx4 v4, v[17:20], s[2:3] offset:48 -; GFX906-NEXT: global_store_dwordx4 v4, v[21:24], s[2:3] offset:64 -; GFX906-NEXT: global_store_dwordx4 v4, v[25:28], s[2:3] offset:80 -; GFX906-NEXT: global_store_dwordx4 v4, v[29:32], s[2:3] offset:96 -; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[2:3] offset:112 -; GFX906-NEXT: global_store_dwordx4 v4, v[37:40], s[2:3] offset:128 -; GFX906-NEXT: global_store_dwordx4 v4, v[41:44], s[2:3] offset:144 -; GFX906-NEXT: global_store_dwordx4 v4, v[45:48], s[2:3] offset:160 -; GFX906-NEXT: global_store_dwordx4 v4, v[49:52], s[2:3] offset:176 -; GFX906-NEXT: global_store_dwordx4 v4, v[53:56], s[2:3] offset:192 -; GFX906-NEXT: global_store_dwordx4 v4, v[57:60], s[2:3] offset:208 -; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:224 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload +; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[0:1] +; GFX906-NEXT: global_store_dwordx4 v4, v[9:12], s[0:1] offset:16 +; GFX906-NEXT: global_store_dwordx4 v4, v[13:16], s[0:1] offset:32 +; GFX906-NEXT: global_store_dwordx4 v4, v[17:20], s[0:1] offset:48 +; GFX906-NEXT: global_store_dwordx4 v4, v[21:24], s[0:1] offset:64 +; GFX906-NEXT: global_store_dwordx4 v4, v[25:28], s[0:1] offset:80 +; GFX906-NEXT: global_store_dwordx4 v4, v[29:32], s[0:1] 
offset:96 +; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[0:1] offset:112 +; GFX906-NEXT: global_store_dwordx4 v4, v[37:40], s[0:1] offset:128 +; GFX906-NEXT: global_store_dwordx4 v4, v[41:44], s[0:1] offset:144 +; GFX906-NEXT: global_store_dwordx4 v4, v[45:48], s[0:1] offset:160 +; GFX906-NEXT: global_store_dwordx4 v4, v[49:52], s[0:1] offset:176 +; GFX906-NEXT: global_store_dwordx4 v4, v[53:56], s[0:1] offset:192 +; GFX906-NEXT: global_store_dwordx4 v4, v[57:60], s[0:1] offset:208 +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:224 +; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:240 +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:240 ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -427,26 +427,26 @@ bb.2: define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: repeat_successor: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dword s2, s[0:1], 0x24 -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX906-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_cmp_lt_i32 s2, 3 +; GFX906-NEXT: s_cmp_lt_i32 s0, 3 ; GFX906-NEXT: s_cbranch_scc0 
.LBB7_3 ; GFX906-NEXT: ; %bb.1: ; %LeafBlock -; GFX906-NEXT: s_cmp_ge_i32 s2, 1 +; GFX906-NEXT: s_cmp_ge_i32 s0, 1 ; GFX906-NEXT: s_cbranch_scc0 .LBB7_6 ; GFX906-NEXT: ; %bb.2: ; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX906-NEXT: global_load_dword v0, v0, s[4:5] ; GFX906-NEXT: s_branch .LBB7_5 ; GFX906-NEXT: .LBB7_3: ; %LeafBlock5 -; GFX906-NEXT: s_cmp_eq_u32 s2, 3 +; GFX906-NEXT: s_cmp_eq_u32 s0, 3 ; GFX906-NEXT: s_cbranch_scc0 .LBB7_6 ; GFX906-NEXT: ; %bb.4: ; %sw.bb5 ; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX906-NEXT: global_load_dword v0, v0, s[6:7] ; GFX906-NEXT: .LBB7_5: ; %return.sink.split -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c ; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_store_dword v1, v0, s[0:1] @@ -479,7 +479,7 @@ return: define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_phi_chain: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) @@ -533,7 +533,7 @@ bb.3: define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_multi_block: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) @@ -584,14 +584,14 @@ bb.3: define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v32i8_loop_carried: ; 
GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; GFX906-NEXT: v_mov_b32_e32 v3, 8 ; GFX906-NEXT: v_mov_b32_e32 v2, 0xff ; GFX906-NEXT: v_cmp_le_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dword v1, v1, s[2:3] -; GFX906-NEXT: s_mov_b64 s[2:3], 0 +; GFX906-NEXT: global_load_dword v1, v1, s[0:1] +; GFX906-NEXT: s_mov_b64 s[0:1], 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX906-NEXT: v_and_or_b32 v0, v1, v2, v0 @@ -602,13 +602,13 @@ define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrsp ; GFX906-NEXT: s_and_b64 s[4:5], exec, vcc ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX906-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] +; GFX906-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] ; GFX906-NEXT: v_or3_b32 v1, v0, v3, v1 -; GFX906-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX906-NEXT: s_cbranch_execnz .LBB10_1 ; GFX906-NEXT: ; %bb.2: ; %bb.2.loopexit -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_store_dword v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll index 037210a496d6d..ef2e57eafbf13 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void 
@constant_load_i8_align4(ptr addrspace (1) %out, ptr addrspace(4) %in) #0 { ; GFX8-LABEL: constant_load_i8_align4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -18,7 +18,7 @@ define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr a ; ; GFX9-LABEL: constant_load_i8_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -29,7 +29,7 @@ define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr a ; ; GFX10-LABEL: constant_load_i8_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -45,7 +45,7 @@ define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr a define amdgpu_kernel void @constant_load_i16_align4(ptr addrspace (1) %out, ptr addrspace(4) %in) #0 { ; GFX8-LABEL: constant_load_i16_align4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -57,7 +57,7 @@ define amdgpu_kernel void @constant_load_i16_align4(ptr addrspace (1) %out, ptr ; ; GFX9-LABEL: constant_load_i16_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -68,7 +68,7 @@ define amdgpu_kernel void @constant_load_i16_align4(ptr addrspace 
(1) %out, ptr ; ; GFX10-LABEL: constant_load_i16_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -84,7 +84,7 @@ define amdgpu_kernel void @constant_load_i16_align4(ptr addrspace (1) %out, ptr define amdgpu_kernel void @sextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: sextload_i8_to_i32_align4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -97,7 +97,7 @@ define amdgpu_kernel void @sextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: sextload_i8_to_i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -109,7 +109,7 @@ define amdgpu_kernel void @sextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: sextload_i8_to_i32_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -127,7 +127,7 @@ define amdgpu_kernel void @sextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr define amdgpu_kernel void @sextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: sextload_i16_to_i32_align4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -140,7 +140,7 @@ 
define amdgpu_kernel void @sextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: sextload_i16_to_i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -152,7 +152,7 @@ define amdgpu_kernel void @sextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: sextload_i16_to_i32_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -170,7 +170,7 @@ define amdgpu_kernel void @sextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr define amdgpu_kernel void @zextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: zextload_i8_to_i32_align4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -183,7 +183,7 @@ define amdgpu_kernel void @zextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: zextload_i8_to_i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -195,7 +195,7 @@ define amdgpu_kernel void @zextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: zextload_i8_to_i32_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -213,7 +213,7 @@ define amdgpu_kernel void 
@zextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr define amdgpu_kernel void @zextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: zextload_i16_to_i32_align4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -226,7 +226,7 @@ define amdgpu_kernel void @zextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: zextload_i16_to_i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -238,7 +238,7 @@ define amdgpu_kernel void @zextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: zextload_i16_to_i32_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -256,7 +256,7 @@ define amdgpu_kernel void @zextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr define amdgpu_kernel void @constant_load_i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: constant_load_i8_align2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -269,7 +269,7 @@ define amdgpu_kernel void @constant_load_i8_align2(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: constant_load_i8_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v1, 
v0, s[2:3] @@ -279,7 +279,7 @@ define amdgpu_kernel void @constant_load_i8_align2(ptr addrspace(1) %out, ptr ad ; ; GFX10-LABEL: constant_load_i8_align2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -294,7 +294,7 @@ define amdgpu_kernel void @constant_load_i8_align2(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @constant_load_i16_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: constant_load_i16_align2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -307,7 +307,7 @@ define amdgpu_kernel void @constant_load_i16_align2(ptr addrspace(1) %out, ptr a ; ; GFX9-LABEL: constant_load_i16_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -317,7 +317,7 @@ define amdgpu_kernel void @constant_load_i16_align2(ptr addrspace(1) %out, ptr a ; ; GFX10-LABEL: constant_load_i16_align2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] @@ -332,7 +332,7 @@ define amdgpu_kernel void @constant_load_i16_align2(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: constant_sextload_i8_align2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -351,7 +351,7 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt ; ; GFX9-LABEL: constant_sextload_i8_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_sbyte v1, v0, s[2:3] @@ -362,7 +362,7 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt ; ; GFX10-LABEL: constant_sextload_i8_align2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_sbyte v1, v0, s[2:3] @@ -379,7 +379,7 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_zextload_i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: constant_zextload_i8_align2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -398,7 +398,7 @@ define amdgpu_kernel void @constant_zextload_i8_align2(ptr addrspace(1) %out, pt ; ; GFX9-LABEL: constant_zextload_i8_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -409,7 +409,7 @@ define amdgpu_kernel void @constant_zextload_i8_align2(ptr addrspace(1) %out, pt ; ; GFX10-LABEL: constant_zextload_i8_align2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll index 422e2747094ce..e9797fa1fc309 100644 --- a/llvm/test/CodeGen/AMDGPU/add.ll +++ b/llvm/test/CodeGen/AMDGPU/add.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: s_add_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -22,7 +22,7 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX8-LABEL: s_add_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -35,7 +35,7 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: s_add_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 @@ -47,7 +47,7 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX10-LABEL: s_add_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 @@ -59,7 +59,7 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX11-LABEL: s_add_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: 
s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -73,7 +73,7 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX12-LABEL: s_add_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -95,7 +95,7 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: s_add_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -110,7 +110,7 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX8-LABEL: s_add_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -125,7 +125,7 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_add_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -139,7 +139,7 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: s_add_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -153,7 +153,7 @@ define amdgpu_kernel void 
@s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: s_add_v2i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -169,7 +169,7 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX12-LABEL: s_add_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -193,7 +193,7 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: s_add_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NEXT: s_mov_b32 s11, 0xf000 @@ -212,7 +212,7 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX8-LABEL: s_add_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v4, s8 @@ -231,7 +231,7 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_add_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -249,7 +249,7 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: s_add_v4i32: ; GFX10: ; %bb.0: 
-; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -267,7 +267,7 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: s_add_v4i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -286,7 +286,7 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX12-LABEL: s_add_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -313,36 +313,36 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x i32> %b) { ; GFX6-LABEL: s_add_v8i32: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x11 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x11 +; GFX6-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 +; GFX6-NEXT: s_mov_b32 s23, 0xf000 +; GFX6-NEXT: s_mov_b32 s22, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s11, s11, s19 -; GFX6-NEXT: s_add_i32 s10, s10, s18 -; GFX6-NEXT: s_add_i32 s9, s9, s17 -; GFX6-NEXT: s_add_i32 s8, s8, s16 -; GFX6-NEXT: s_add_i32 s7, s7, s15 -; GFX6-NEXT: s_add_i32 s6, s6, s14 -; GFX6-NEXT: s_add_i32 s5, s5, s13 -; GFX6-NEXT: s_add_i32 s4, s4, s12 -; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: v_mov_b32_e32 v1, s9 -; GFX6-NEXT: v_mov_b32_e32 v2, s10 -; GFX6-NEXT: v_mov_b32_e32 v3, s11 
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GFX6-NEXT: s_add_i32 s0, s7, s15 +; GFX6-NEXT: s_add_i32 s1, s6, s14 +; GFX6-NEXT: s_add_i32 s2, s5, s13 +; GFX6-NEXT: s_add_i32 s3, s4, s12 +; GFX6-NEXT: s_add_i32 s4, s11, s19 +; GFX6-NEXT: s_add_i32 s5, s10, s18 +; GFX6-NEXT: s_add_i32 s6, s9, s17 +; GFX6-NEXT: s_add_i32 s7, s8, s16 +; GFX6-NEXT: v_mov_b32_e32 v0, s7 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, s7 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NEXT: v_mov_b32_e32 v3, s0 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_add_v8i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x44 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_i32 s7, s7, s15 ; GFX8-NEXT: s_add_i32 s6, s6, s14 @@ -372,9 +372,9 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x ; ; GFX9-LABEL: s_add_v8i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44 +; GFX9-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x44 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_i32 s2, s7, s15 ; GFX9-NEXT: s_add_i32 s3, s6, s14 @@ -399,9 +399,10 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x ; ; GFX10-LABEL: 
s_add_v8i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x44 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_i32 s2, s7, s15 ; GFX10-NEXT: s_add_i32 s3, s6, s14 @@ -426,8 +427,8 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x ; GFX11-LABEL: s_add_v8i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x44 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_i32 s2, s7, s15 ; GFX11-NEXT: s_add_i32 s3, s6, s14 @@ -452,8 +453,8 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x ; GFX12-LABEL: s_add_v8i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b512 s[4:19], s[0:1], 0x44 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b512 s[4:19], s[2:3], 0x44 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_co_i32 s2, s7, s15 ; GFX12-NEXT: s_add_co_i32 s3, s6, s14 @@ -483,58 +484,58 @@ entry: define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <16 x i32> %b) { ; GFX6-LABEL: s_add_v16i32: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 -; GFX6-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x29 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 +; GFX6-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x29 +; GFX6-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 +; GFX6-NEXT: s_mov_b32 s23, 0xf000 +; GFX6-NEXT: s_mov_b32 
s22, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s19, s19, s51 -; GFX6-NEXT: s_add_i32 s18, s18, s50 -; GFX6-NEXT: s_add_i32 s17, s17, s49 -; GFX6-NEXT: s_add_i32 s16, s16, s48 -; GFX6-NEXT: s_add_i32 s15, s15, s47 -; GFX6-NEXT: s_add_i32 s14, s14, s46 -; GFX6-NEXT: s_add_i32 s13, s13, s45 -; GFX6-NEXT: s_add_i32 s12, s12, s44 -; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: v_mov_b32_e32 v1, s17 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: v_mov_b32_e32 v3, s19 -; GFX6-NEXT: s_add_i32 s11, s11, s43 -; GFX6-NEXT: s_add_i32 s10, s10, s42 -; GFX6-NEXT: s_add_i32 s9, s9, s41 -; GFX6-NEXT: s_add_i32 s8, s8, s40 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GFX6-NEXT: s_add_i32 s0, s7, s39 +; GFX6-NEXT: s_add_i32 s1, s6, s38 +; GFX6-NEXT: s_add_i32 s2, s5, s37 +; GFX6-NEXT: s_add_i32 s3, s4, s36 +; GFX6-NEXT: s_add_i32 s4, s11, s43 +; GFX6-NEXT: s_add_i32 s5, s10, s42 +; GFX6-NEXT: s_add_i32 s6, s9, s41 +; GFX6-NEXT: s_add_i32 s7, s8, s40 +; GFX6-NEXT: s_add_i32 s8, s15, s47 +; GFX6-NEXT: s_add_i32 s9, s14, s46 +; GFX6-NEXT: s_add_i32 s10, s13, s45 +; GFX6-NEXT: s_add_i32 s11, s12, s44 +; GFX6-NEXT: s_add_i32 s12, s19, s51 +; GFX6-NEXT: s_add_i32 s13, s18, s50 +; GFX6-NEXT: s_add_i32 s14, s17, s49 +; GFX6-NEXT: s_add_i32 s15, s16, s48 +; GFX6-NEXT: v_mov_b32_e32 v0, s15 +; GFX6-NEXT: v_mov_b32_e32 v1, s14 +; GFX6-NEXT: v_mov_b32_e32 v2, s13 +; GFX6-NEXT: v_mov_b32_e32 v3, s12 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s12 -; GFX6-NEXT: v_mov_b32_e32 v1, s13 -; GFX6-NEXT: v_mov_b32_e32 v2, s14 -; GFX6-NEXT: v_mov_b32_e32 v3, s15 -; GFX6-NEXT: s_add_i32 s7, s7, s39 -; GFX6-NEXT: s_add_i32 s6, s6, s38 -; GFX6-NEXT: s_add_i32 s5, s5, s37 -; GFX6-NEXT: s_add_i32 s4, s4, s36 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GFX6-NEXT: v_mov_b32_e32 v0, s11 +; GFX6-NEXT: v_mov_b32_e32 v1, s10 +; GFX6-NEXT: 
v_mov_b32_e32 v2, s9 +; GFX6-NEXT: v_mov_b32_e32 v3, s8 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: v_mov_b32_e32 v1, s9 -; GFX6-NEXT: v_mov_b32_e32 v2, s10 -; GFX6-NEXT: v_mov_b32_e32 v3, s11 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s7 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, s7 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NEXT: v_mov_b32_e32 v3, s0 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_add_v16i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; GFX8-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; GFX8-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_i32 s7, s7, s39 ; GFX8-NEXT: s_add_i32 s6, s6, s38 @@ -590,10 +591,10 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1 ; ; GFX9-LABEL: s_add_v16i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; GFX9-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4 +; GFX9-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; GFX9-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_i32 s2, s7, s39 ; GFX9-NEXT: s_add_i32 s3, s6, s38 @@ -637,11 +638,11 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1 ; ; GFX10-LABEL: s_add_v16i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; GFX10-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4 +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; GFX10-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v16, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_i32 s2, s7, s39 ; GFX10-NEXT: s_add_i32 s3, s6, s38 @@ -684,9 +685,9 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1 ; GFX11-LABEL: s_add_v16i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x64 -; GFX11-NEXT: s_load_b512 s[36:51], s[0:1], 0xa4 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x64 +; GFX11-NEXT: s_load_b512 s[36:51], s[2:3], 0xa4 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_i32 s2, s7, s39 ; GFX11-NEXT: s_add_i32 s3, s6, s38 @@ -725,9 +726,9 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1 ; GFX12-LABEL: s_add_v16i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x2 -; GFX12-NEXT: s_load_b512 s[4:19], s[0:1], 0x64 -; GFX12-NEXT: s_load_b512 s[36:51], s[0:1], 0xa4 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b512 s[4:19], s[2:3], 0x64 +; GFX12-NEXT: s_load_b512 s[36:51], s[2:3], 0xa4 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_co_i32 s2, s7, s39 ; GFX12-NEXT: s_add_co_i32 s3, s6, s38 @@ -771,7 +772,7 @@ entry: define amdgpu_kernel 
void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_add_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_mov_b32 s11, s7 @@ -792,7 +793,7 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX8-LABEL: v_add_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -812,7 +813,7 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: v_add_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -826,7 +827,7 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX10-LABEL: v_add_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -840,9 +841,11 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX11-LABEL: v_add_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -856,9 +859,11 
@@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX12-LABEL: v_add_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -882,7 +887,7 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_add_imm_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_mov_b32 s11, s7 @@ -901,7 +906,7 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: v_add_imm_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -917,7 +922,7 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_add_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -929,7 +934,7 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: v_add_imm_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, 
v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -941,8 +946,10 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_add_imm_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -954,8 +961,10 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX12-LABEL: v_add_imm_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -976,8 +985,8 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX6-LABEL: add64: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -992,8 +1001,8 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; ; GFX8-LABEL: add64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s0, s6, s0 @@ -1006,12 +1015,12 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; ; GFX9-LABEL: add64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s6, s2 -; GFX9-NEXT: s_addc_u32 s1, s7, s3 +; GFX9-NEXT: s_add_u32 s0, s6, s0 +; GFX9-NEXT: s_addc_u32 s1, s7, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -1020,12 +1029,12 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX10-LABEL: add64: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s6, s2 -; GFX10-NEXT: s_addc_u32 s1, s7, s3 +; GFX10-NEXT: s_add_u32 s0, s6, s0 +; GFX10-NEXT: s_addc_u32 s1, s7, s1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -1034,8 +1043,8 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-LABEL: add64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s6, s0 ; GFX11-NEXT: s_addc_u32 s1, s7, s1 @@ -1049,8 +1058,8 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX12-LABEL: add64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1] ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -1072,8 +1081,8 @@ entry: define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr addrspace(1) %in) { ; GFX6-LABEL: add64_sgpr_vgpr: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1090,8 +1099,8 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad ; ; GFX8-LABEL: add64_sgpr_vgpr: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1106,11 +1115,11 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad ; ; GFX9-LABEL: add64_sgpr_vgpr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s6, s0 ; GFX9-NEXT: s_addc_u32 s1, s7, s1 @@ -1122,11 +1131,11 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad ; GFX10-LABEL: add64_sgpr_vgpr: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s6, s0 ; GFX10-NEXT: s_addc_u32 s1, s7, s1 @@ -1138,8 +1147,8 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad ; GFX11-LABEL: add64_sgpr_vgpr: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1155,8 +1164,8 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad ; GFX12-LABEL: add64_sgpr_vgpr: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1178,7 +1187,7 @@ entry: define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) { ; GFX6-LABEL: add64_in_branch: ; 
GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 @@ -1205,7 +1214,7 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add64_in_branch: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -1231,7 +1240,7 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add64_in_branch: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -1256,7 +1265,7 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: add64_in_branch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX10-NEXT: s_cbranch_scc0 .LBB9_4 @@ -1279,7 +1288,7 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: add64_in_branch: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 @@ -1303,7 +1312,7 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: add64_in_branch: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 
0x0 ; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX12-NEXT: s_cbranch_scc0 .LBB9_4 diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll index 6f67ce4de9ce5..b751be51a9739 100644 --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -9,8 +9,8 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -33,13 +33,13 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: v_test_add_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2 @@ -49,13 +49,13 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX10-LABEL: v_test_add_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc 
dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_pk_add_u16 v1, v1, v2 @@ -65,10 +65,12 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX11-LABEL: v_test_add_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -93,8 +95,8 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0, ptr addrspace(4) %in1) #1 { ; VI-LABEL: s_test_add_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -114,37 +116,37 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: s_test_add_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: 
s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_pk_add_u16 v1, s1, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_pk_add_u16 v1, s3, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_add_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX10-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_add_u16 v1, s0, s1 +; GFX10-NEXT: v_pk_add_u16 v1, s2, s3 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_add_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0 @@ -165,7 +167,7 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 { ; VI-LABEL: s_test_add_self_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -184,7 +186,7 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, 
ptr addr ; ; GFX9-LABEL: s_test_add_self_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -195,7 +197,7 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX10-LABEL: s_test_add_self_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -206,7 +208,7 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: s_test_add_self_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -226,7 +228,7 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 { ; VI-LABEL: s_test_add_v2i16_kernarg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: s_lshr_b32 s5, s3, 16 @@ -243,7 +245,7 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: s_test_add_v2i16_kernarg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -253,7 +255,7 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX10-LABEL: s_test_add_v2i16_kernarg: ; GFX10: ; %bb.0: -; 
GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v1, s2, s3 @@ -262,7 +264,7 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX11-LABEL: s_test_add_v2i16_kernarg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_add_u16 v1, s2, s3 @@ -279,7 +281,7 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 0x1c8 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -298,7 +300,7 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_add_v2i16_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -311,7 +313,7 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: v_test_add_v2i16_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -323,8 +325,10 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_test_add_v2i16_constant: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: 
v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -346,7 +350,7 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_neg_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 0xfffffc21 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -365,7 +369,7 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_neg_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -378,7 +382,7 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_test_add_v2i16_neg_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -390,8 +394,10 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_add_v2i16_neg_constant: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, 
v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -412,7 +418,7 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_inline_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -431,7 +437,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX9-LABEL: v_test_add_v2i16_inline_neg1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -443,7 +449,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX10-LABEL: v_test_add_v2i16_inline_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -455,8 +461,10 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX11-LABEL: v_test_add_v2i16_inline_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] 
glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -477,7 +485,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -495,7 +503,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -507,7 +515,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX10-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -519,8 +527,10 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -542,7 +552,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) % 
define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_inline_fp_split: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f80 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -560,7 +570,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_add_v2i16_inline_fp_split: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -572,7 +582,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX10-LABEL: v_test_add_v2i16_inline_fp_split: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -584,8 +594,10 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_add_v2i16_inline_fp_split: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -607,8 +619,8 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: 
v_test_add_v2i16_zext_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -630,14 +642,14 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -648,13 +660,13 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -666,10 +678,12 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; 
GFX11-LABEL: v_test_add_v2i16_zext_to_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -699,8 +713,8 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16_zext_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -724,14 +738,14 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v3, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v2, v3 ; GFX9-NEXT: v_alignbit_b32 v2, 0, v0, 16 @@ -743,13 +757,13 @@ define 
amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -762,8 +776,10 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -795,8 +811,8 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16_sext_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -820,14 +836,14 @@ define amdgpu_kernel void 
@v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0 @@ -838,13 +854,13 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -856,10 +872,12 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -889,8 +907,8 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16_sext_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -915,13 +933,13 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 @@ -935,14 +953,14 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: 
v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -956,10 +974,12 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll index 526d5c946ec7f..1315d576a83eb 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -1,5 +1,7 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=CI %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=GFX9 %s +; RUN: opt -passes=amdgpu-attributor -mcpu=kaveri -mattr=-promote-alloca < %s | llc | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=CI %s +; RUN: opt -passes=amdgpu-attributor -mcpu=gfx900 -mattr=-promote-alloca < %s | llc | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=GFX9 %s + +target triple = "amdgcn-amd-amdhsa" ; HSA-LABEL: 
{{^}}use_group_to_flat_addrspacecast: diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 60f61a67ccf0b..fb96b9ff2952e 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -245,7 +245,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 { define amdgpu_kernel void @no_agpr_no_reserve(ptr addrspace(1) %arg) #0 { ; GFX908-LABEL: no_agpr_no_reserve: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX908-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 @@ -303,7 +303,8 @@ define amdgpu_kernel void @no_agpr_no_reserve(ptr addrspace(1) %arg) #0 { ; ; GFX90A-LABEL: no_agpr_no_reserve: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 @@ -514,14 +515,14 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-LABEL: introduced_copy_to_sgpr: ; GFX908: ; %bb.0: ; %bb ; GFX908-NEXT: global_load_ushort v16, v[0:1], off glc -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX908-NEXT: s_load_dword s9, s[4:5], 0x18 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX908-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX908-NEXT: s_load_dword s9, s[6:7], 0x18 ; GFX908-NEXT: s_mov_b32 s8, 0 -; GFX908-NEXT: s_mov_b32 s5, s8 +; GFX908-NEXT: s_mov_b32 s7, s8 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX908-NEXT: s_sub_i32 s4, 0, s3 +; GFX908-NEXT: s_sub_i32 
s6, 0, s3 ; GFX908-NEXT: v_cvt_f32_f16_e32 v17, s9 ; GFX908-NEXT: v_mov_b32_e32 v19, 0 ; GFX908-NEXT: v_rcp_iflag_f32_e32 v2, v0 @@ -530,32 +531,32 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX908-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 -; GFX908-NEXT: s_mul_i32 s4, s4, s10 -; GFX908-NEXT: s_mul_hi_u32 s4, s10, s4 -; GFX908-NEXT: s_add_i32 s10, s10, s4 -; GFX908-NEXT: s_mul_hi_u32 s4, s2, s10 -; GFX908-NEXT: s_mul_i32 s10, s4, s3 +; GFX908-NEXT: s_mul_i32 s6, s6, s10 +; GFX908-NEXT: s_mul_hi_u32 s6, s10, s6 +; GFX908-NEXT: s_add_i32 s10, s10, s6 +; GFX908-NEXT: s_mul_hi_u32 s6, s2, s10 +; GFX908-NEXT: s_mul_i32 s10, s6, s3 ; GFX908-NEXT: s_sub_i32 s2, s2, s10 -; GFX908-NEXT: s_add_i32 s11, s4, 1 +; GFX908-NEXT: s_add_i32 s11, s6, 1 ; GFX908-NEXT: s_sub_i32 s10, s2, s3 ; GFX908-NEXT: s_cmp_ge_u32 s2, s3 -; GFX908-NEXT: s_cselect_b32 s4, s11, s4 +; GFX908-NEXT: s_cselect_b32 s6, s11, s6 ; GFX908-NEXT: s_cselect_b32 s2, s10, s2 -; GFX908-NEXT: s_add_i32 s10, s4, 1 +; GFX908-NEXT: s_add_i32 s10, s6, 1 ; GFX908-NEXT: s_cmp_ge_u32 s2, s3 -; GFX908-NEXT: s_cselect_b32 s4, s10, s4 +; GFX908-NEXT: s_cselect_b32 s6, s10, s6 ; GFX908-NEXT: s_lshr_b32 s9, s9, 16 -; GFX908-NEXT: s_lshl_b64 s[12:13], s[4:5], 5 +; GFX908-NEXT: s_lshl_b64 s[12:13], s[6:7], 5 ; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s9 ; GFX908-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 -; GFX908-NEXT: s_lshl_b64 s[10:11], s[6:7], 5 +; GFX908-NEXT: s_lshl_b64 s[10:11], s[4:5], 5 ; GFX908-NEXT: s_or_b32 s10, s10, 28 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_readfirstlane_b32 s5, v16 -; GFX908-NEXT: s_and_b32 s5, 0xffff, s5 -; GFX908-NEXT: s_mul_i32 s1, s1, s5 -; GFX908-NEXT: s_mul_hi_u32 s9, s0, s5 -; GFX908-NEXT: s_mul_i32 s0, s0, s5 +; GFX908-NEXT: v_readfirstlane_b32 s7, v16 +; GFX908-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX908-NEXT: s_mul_i32 s1, s1, s7 +; GFX908-NEXT: s_mul_hi_u32 s9, s0, s7 
+; GFX908-NEXT: s_mul_i32 s0, s0, s7 ; GFX908-NEXT: s_add_i32 s1, s9, s1 ; GFX908-NEXT: s_lshl_b64 s[14:15], s[0:1], 5 ; GFX908-NEXT: s_branch .LBB3_2 @@ -571,7 +572,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: ; %bb.3: ; %bb14 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], -1 +; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1 ; GFX908-NEXT: s_mov_b32 s9, s8 ; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] ; GFX908-NEXT: v_mov_b32_e32 v4, s8 @@ -581,20 +582,20 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: v_mov_b32_e32 v5, s9 ; GFX908-NEXT: v_mov_b32_e32 v9, s9 ; GFX908-NEXT: v_mov_b32_e32 v7, s9 -; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[6:7], 0 +; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v11, v5 ; GFX908-NEXT: s_mov_b64 s[18:19], s[10:11] ; GFX908-NEXT: v_mov_b32_e32 v10, v4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_readfirstlane_b32 s5, v2 +; GFX908-NEXT: v_readfirstlane_b32 s7, v2 ; GFX908-NEXT: v_readfirstlane_b32 s9, v3 -; GFX908-NEXT: s_add_u32 s5, s5, 1 +; GFX908-NEXT: s_add_u32 s7, s7, 1 ; GFX908-NEXT: s_addc_u32 s9, s9, 0 -; GFX908-NEXT: s_mul_hi_u32 s20, s2, s5 +; GFX908-NEXT: s_mul_hi_u32 s20, s2, s7 ; GFX908-NEXT: s_mul_i32 s9, s2, s9 -; GFX908-NEXT: s_mul_i32 s21, s3, s5 +; GFX908-NEXT: s_mul_i32 s21, s3, s7 ; GFX908-NEXT: s_add_i32 s9, s20, s9 -; GFX908-NEXT: s_mul_i32 s5, s2, s5 +; GFX908-NEXT: s_mul_i32 s7, s2, s7 ; GFX908-NEXT: s_add_i32 s9, s9, s21 ; GFX908-NEXT: s_branch .LBB3_5 ; GFX908-NEXT: .LBB3_4: ; %bb58 @@ -610,7 +611,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: .LBB3_5: ; %bb16 ; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: s_add_u32 s20, s18, s5 +; 
GFX908-NEXT: s_add_u32 s20, s18, s7 ; GFX908-NEXT: s_addc_u32 s21, s19, s9 ; GFX908-NEXT: global_load_dword v21, v19, s[20:21] offset:-12 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -670,8 +671,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_cbranch_vccz .LBB3_1 ; GFX908-NEXT: ; %bb.11: ; %bb12 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_add_u32 s6, s6, s4 -; GFX908-NEXT: s_addc_u32 s7, s7, 0 +; GFX908-NEXT: s_add_u32 s4, s4, s6 +; GFX908-NEXT: s_addc_u32 s5, s5, 0 ; GFX908-NEXT: s_add_u32 s10, s10, s12 ; GFX908-NEXT: s_addc_u32 s11, s11, s13 ; GFX908-NEXT: s_mov_b64 s[0:1], 0 @@ -682,14 +683,14 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-LABEL: introduced_copy_to_sgpr: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: global_load_ushort v18, v[0:1], off glc -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX90A-NEXT: s_load_dword s9, s[4:5], 0x18 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX90A-NEXT: s_load_dword s9, s[6:7], 0x18 ; GFX90A-NEXT: s_mov_b32 s8, 0 -; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX90A-NEXT: s_sub_i32 s4, 0, s3 +; GFX90A-NEXT: s_sub_i32 s6, 0, s3 ; GFX90A-NEXT: v_mov_b32_e32 v19, 0 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 0, 0 @@ -697,32 +698,32 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s9 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v3 -; GFX90A-NEXT: s_mul_i32 s4, s4, s10 -; GFX90A-NEXT: s_mul_hi_u32 s4, s10, s4 -; GFX90A-NEXT: s_add_i32 s10, s10, s4 -; GFX90A-NEXT: s_mul_hi_u32 s4, s2, s10 -; GFX90A-NEXT: s_mul_i32 s10, s4, s3 +; GFX90A-NEXT: s_mul_i32 s6, s6, 
s10 +; GFX90A-NEXT: s_mul_hi_u32 s6, s10, s6 +; GFX90A-NEXT: s_add_i32 s10, s10, s6 +; GFX90A-NEXT: s_mul_hi_u32 s6, s2, s10 +; GFX90A-NEXT: s_mul_i32 s10, s6, s3 ; GFX90A-NEXT: s_sub_i32 s2, s2, s10 -; GFX90A-NEXT: s_add_i32 s11, s4, 1 +; GFX90A-NEXT: s_add_i32 s11, s6, 1 ; GFX90A-NEXT: s_sub_i32 s10, s2, s3 ; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 -; GFX90A-NEXT: s_cselect_b32 s4, s11, s4 +; GFX90A-NEXT: s_cselect_b32 s6, s11, s6 ; GFX90A-NEXT: s_cselect_b32 s2, s10, s2 -; GFX90A-NEXT: s_add_i32 s10, s4, 1 +; GFX90A-NEXT: s_add_i32 s10, s6, 1 ; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 -; GFX90A-NEXT: s_cselect_b32 s4, s10, s4 +; GFX90A-NEXT: s_cselect_b32 s6, s10, s6 ; GFX90A-NEXT: s_lshr_b32 s9, s9, 16 -; GFX90A-NEXT: s_lshl_b64 s[12:13], s[4:5], 5 +; GFX90A-NEXT: s_lshl_b64 s[12:13], s[6:7], 5 ; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s9 ; GFX90A-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 -; GFX90A-NEXT: s_lshl_b64 s[10:11], s[6:7], 5 +; GFX90A-NEXT: s_lshl_b64 s[10:11], s[4:5], 5 ; GFX90A-NEXT: s_or_b32 s10, s10, 28 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_readfirstlane_b32 s5, v18 -; GFX90A-NEXT: s_and_b32 s5, 0xffff, s5 -; GFX90A-NEXT: s_mul_i32 s1, s1, s5 -; GFX90A-NEXT: s_mul_hi_u32 s9, s0, s5 -; GFX90A-NEXT: s_mul_i32 s0, s0, s5 +; GFX90A-NEXT: v_readfirstlane_b32 s7, v18 +; GFX90A-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX90A-NEXT: s_mul_i32 s1, s1, s7 +; GFX90A-NEXT: s_mul_hi_u32 s9, s0, s7 +; GFX90A-NEXT: s_mul_i32 s0, s0, s7 ; GFX90A-NEXT: s_add_i32 s1, s9, s1 ; GFX90A-NEXT: s_lshl_b64 s[14:15], s[0:1], 5 ; GFX90A-NEXT: s_branch .LBB3_2 @@ -738,26 +739,26 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: ; %bb.3: ; %bb14 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], -1 +; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1 ; GFX90A-NEXT: s_mov_b32 s9, s8 ; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] ; GFX90A-NEXT: 
v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1] ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v8 ; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] -; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[6:7], 0 +; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0 ; GFX90A-NEXT: s_mov_b64 s[18:19], s[10:11] ; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_readfirstlane_b32 s5, v4 +; GFX90A-NEXT: v_readfirstlane_b32 s7, v4 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v5 -; GFX90A-NEXT: s_add_u32 s5, s5, 1 +; GFX90A-NEXT: s_add_u32 s7, s7, 1 ; GFX90A-NEXT: s_addc_u32 s9, s9, 0 -; GFX90A-NEXT: s_mul_hi_u32 s20, s2, s5 +; GFX90A-NEXT: s_mul_hi_u32 s20, s2, s7 ; GFX90A-NEXT: s_mul_i32 s9, s2, s9 -; GFX90A-NEXT: s_mul_i32 s21, s3, s5 +; GFX90A-NEXT: s_mul_i32 s21, s3, s7 ; GFX90A-NEXT: s_add_i32 s9, s20, s9 -; GFX90A-NEXT: s_mul_i32 s5, s2, s5 +; GFX90A-NEXT: s_mul_i32 s7, s2, s7 ; GFX90A-NEXT: s_add_i32 s9, s9, s21 ; GFX90A-NEXT: s_branch .LBB3_5 ; GFX90A-NEXT: .LBB3_4: ; %bb58 @@ -773,7 +774,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: .LBB3_5: ; %bb16 ; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX90A-NEXT: s_add_u32 s20, s18, s5 +; GFX90A-NEXT: s_add_u32 s20, s18, s7 ; GFX90A-NEXT: s_addc_u32 s21, s19, s9 ; GFX90A-NEXT: global_load_dword v21, v19, s[20:21] offset:-12 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -826,8 +827,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_cbranch_vccz .LBB3_1 ; GFX90A-NEXT: ; %bb.11: ; %bb12 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_add_u32 s6, s6, s4 -; GFX90A-NEXT: s_addc_u32 s7, s7, 0 +; GFX90A-NEXT: s_add_u32 s4, s4, s6 +; GFX90A-NEXT: s_addc_u32 s5, s5, 0 ; GFX90A-NEXT: s_add_u32 s10, s10, s12 ; GFX90A-NEXT: s_addc_u32 s11, 
s11, s13 ; GFX90A-NEXT: s_mov_b64 s[0:1], 0 diff --git a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll index bd5dc6e207098..8d87b53efb4e7 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll @@ -175,4 +175,4 @@ bb: ret void } -attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" } +attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/always-uniform.ll b/llvm/test/CodeGen/AMDGPU/always-uniform.ll index 0c5e1ec0d5b6f..0a461f9ee6c96 100644 --- a/llvm/test/CodeGen/AMDGPU/always-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/always-uniform.ll @@ -7,7 +7,7 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapture readonly, ptr addrspace(1) noalias nocapture readonly) { ; GCN-LABEL: readfirstlane_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 diff --git a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll index 330cf48803680..def6df9adf597 100644 --- a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll +++ b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll @@ -41,16 +41,16 @@ define void @test1() { define amdgpu_kernel void @test2(ptr %p, i32 %x) { ; GFX9-LABEL: test2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lt_i32 s2, 1 +; GFX9-NEXT: s_cmp_lt_i32 s0, 1 ; GFX9-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %else -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: s_load_dwordx2 
s[2:3], s[2:3], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: flat_store_dword v[0:1], v2 ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB2_2: ; %then @@ -58,16 +58,16 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) { ; ; GFX10-LABEL: test2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lt_i32 s2, 1 +; GFX10-NEXT: s_cmp_lt_i32 s0, 1 ; GFX10-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX10-NEXT: ; %bb.1: ; %else -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: flat_store_dword v[0:1], v2 ; GFX10-NEXT: s_endpgm ; GFX10-NEXT: .LBB2_2: ; %then @@ -75,15 +75,15 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) { ; ; GFX11-LABEL: test2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lt_i32 s2, 1 +; GFX11-NEXT: s_cmp_lt_i32 s0, 1 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; %else -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB2_2: ; %then diff --git 
a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll index cb59121d69708..bf72cccd912ce 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll @@ -392,7 +392,7 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) { ; ; GCN-LABEL: select_add_lhs_const_i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 +; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x83 ; GCN-NEXT: v_mov_b32_e32 v1, 0x80 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index 8144fb7a3b646..7cf18171a6cd7 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -39,7 +39,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX6-LABEL: udiv_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -72,7 +72,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: udiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 @@ -137,7 +137,7 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX6-LABEL: urem_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -167,7 +167,7 @@ define 
amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: urem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 @@ -241,7 +241,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX6-LABEL: sdiv_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -280,7 +280,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: sdiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_abs_i32 s4, s3 @@ -359,7 +359,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX6-LABEL: srem_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -394,7 +394,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: srem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_abs_i32 s3, s3 @@ -452,15 +452,15 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX6-LABEL: udiv_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[2:3], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 
s3, s2, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_and_b32 s2, s2, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_lshr_b32 s1, s0, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 @@ -468,19 +468,20 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s3, s2, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_and_b32 s2, s2, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX9-NEXT: s_lshr_b32 s1, s0, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 @@ -488,6 +489,7 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv i16 %x, %y @@ -521,36 +523,37 @@ define amdgpu_kernel 
void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX6-LABEL: urem_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s2, s4, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_and_b32 s3, s4, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_lshr_b32 s5, s4, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX6-NEXT: s_and_b32 s0, s4, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s5 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s3, s2, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_and_b32 s4, s2, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_lshr_b32 s5, s4, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: s_and_b32 s0, s4, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 @@ -559,8 +562,8 @@ define amdgpu_kernel void 
@urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -597,8 +600,8 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX6-LABEL: sdiv_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -623,27 +626,27 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX9-LABEL: sdiv_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s4, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9-NEXT: s_sext_i32_i16 s1, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: s_ashr_i32 s2, s4, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_sext_i32_i16 s3, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 +; GFX9-NEXT: s_xor_b32 s2, s3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s4, s0, 1 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s4, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, 
|v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 -; GFX9-NEXT: global_store_short v1, v0, s[2:3] +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v3 +; GFX9-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv i16 %x, %y store i16 %r, ptr addrspace(1) %out @@ -680,8 +683,8 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX6-LABEL: srem_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_ashr_i32 s5, s4, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 @@ -708,7 +711,8 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX9-LABEL: srem_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s5, s4, 16 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s5 @@ -718,7 +722,6 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: s_or_b32 s6, s2, 1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 @@ -730,7 +733,6 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = srem i16 %x, %y @@ -762,8 
+764,8 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX6-LABEL: udiv_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -781,13 +783,13 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX9-LABEL: udiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s4 ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1 @@ -827,8 +829,8 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX6-LABEL: urem_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 @@ -849,13 +851,13 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX9-LABEL: urem_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX9-NEXT: 
v_cvt_f32_ubyte0_e32 v2, s2 -; GFX9-NEXT: s_lshr_b32 s3, s2, 8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 +; GFX9-NEXT: s_lshr_b32 s2, s4, 8 ; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 @@ -863,9 +865,8 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = urem i8 %x, %y @@ -901,8 +902,8 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX6-LABEL: sdiv_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -927,27 +928,27 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX9-LABEL: sdiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80008 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9-NEXT: s_sext_i32_i8 s1, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: s_bfe_i32 s2, s4, 0x80008 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_sext_i32_i8 s3, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 +; GFX9-NEXT: s_xor_b32 s2, s3, s2 ; 
GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s4, s0, 1 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s4, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 -; GFX9-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v3 +; GFX9-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv i8 %x, %y store i8 %r, ptr addrspace(1) %out @@ -984,8 +985,8 @@ define amdgpu_kernel void @srem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX6-LABEL: srem_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_i32 s2, s4, 0x80008 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 @@ -1013,30 +1014,30 @@ define amdgpu_kernel void @srem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX9-LABEL: srem_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80008 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9-NEXT: s_sext_i32_i8 s1, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: s_bfe_i32 s2, s4, 0x80008 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_sext_i32_i8 s3, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 +; GFX9-NEXT: s_xor_b32 
s2, s3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: s_lshr_b32 s5, s4, 8 -; GFX9-NEXT: s_or_b32 s6, s0, 1 +; GFX9-NEXT: s_or_b32 s6, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s6, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v2 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s6, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = srem i8 %x, %y store i8 %r, ptr addrspace(1) %out @@ -1178,13 +1179,13 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX6-LABEL: udiv_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s19, 0xf000 ; GFX6-NEXT: s_mov_b32 s18, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GFX6-NEXT: s_sub_i32 s2, 0, s12 +; GFX6-NEXT: s_sub_i32 s0, 0, s12 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s13 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s14 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -1194,28 +1195,28 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX6-NEXT: 
v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s12 -; GFX6-NEXT: s_sub_i32 s2, s8, s2 -; GFX6-NEXT: s_sub_i32 s3, s2, s12 -; GFX6-NEXT: s_cmp_ge_u32 s2, s12 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s0, s0, s12 +; GFX6-NEXT: s_sub_i32 s0, s8, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s12 +; GFX6-NEXT: s_cmp_ge_u32 s0, s12 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s2, s12 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s0, s12 +; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: s_sub_i32 s4, 0, s13 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v1, s9, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v4 @@ -1276,9 +1277,9 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX9-LABEL: udiv_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 @@ -1498,34 +1499,36 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX6-LABEL: urem_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GFX6-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 +; GFX6-NEXT: s_mov_b32 s15, 0xf000 +; GFX6-NEXT: s_mov_b32 s14, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: s_sub_i32 s2, 0, s8 +; GFX6-NEXT: s_sub_i32 s0, 0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s10 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s8 -; GFX6-NEXT: s_sub_i32 s2, s4, s2 -; GFX6-NEXT: s_sub_i32 s3, s2, s8 -; GFX6-NEXT: s_cmp_ge_u32 s2, s8 -; GFX6-NEXT: s_cselect_b32 s2, s3, s2 -; GFX6-NEXT: s_sub_i32 s3, s2, s8 -; GFX6-NEXT: s_cmp_ge_u32 s2, s8 -; GFX6-NEXT: s_cselect_b32 s4, s3, s2 -; GFX6-NEXT: s_sub_i32 s2, 0, s9 -; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s0, s0, s8 +; GFX6-NEXT: s_sub_i32 s0, s4, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s8 +; GFX6-NEXT: s_cmp_ge_u32 s0, s8 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s8 +; GFX6-NEXT: s_cmp_ge_u32 s0, s8 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, 0, s9 +; GFX6-NEXT: v_mul_lo_u32 v0, s1, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 @@ -1533,60 +1536,58 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 
v2, s11 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s9 -; GFX6-NEXT: s_sub_i32 s2, s5, s2 -; GFX6-NEXT: s_sub_i32 s3, s2, s9 -; GFX6-NEXT: s_cmp_ge_u32 s2, s9 -; GFX6-NEXT: s_cselect_b32 s2, s3, s2 -; GFX6-NEXT: s_sub_i32 s3, s2, s9 -; GFX6-NEXT: s_cmp_ge_u32 s2, s9 -; GFX6-NEXT: s_cselect_b32 s5, s3, s2 -; GFX6-NEXT: s_sub_i32 s2, 0, s10 -; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX6-NEXT: v_readfirstlane_b32 s1, v0 +; GFX6-NEXT: s_mul_i32 s1, s1, s9 +; GFX6-NEXT: s_sub_i32 s1, s5, s1 +; GFX6-NEXT: s_sub_i32 s4, s1, s9 +; GFX6-NEXT: s_cmp_ge_u32 s1, s9 +; GFX6-NEXT: s_cselect_b32 s1, s4, s1 +; GFX6-NEXT: s_sub_i32 s4, s1, s9 +; GFX6-NEXT: s_cmp_ge_u32 s1, s9 +; GFX6-NEXT: s_cselect_b32 s1, s4, s1 +; GFX6-NEXT: s_sub_i32 s4, 0, s10 +; GFX6-NEXT: v_mul_lo_u32 v0, s4, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s10 -; GFX6-NEXT: s_sub_i32 s2, s6, s2 -; GFX6-NEXT: s_sub_i32 s3, s2, s10 -; GFX6-NEXT: s_cmp_ge_u32 s2, s10 -; GFX6-NEXT: s_cselect_b32 s2, s3, s2 -; GFX6-NEXT: s_sub_i32 s3, s2, s10 -; GFX6-NEXT: s_cmp_ge_u32 s2, s10 -; GFX6-NEXT: s_cselect_b32 s6, s3, s2 -; GFX6-NEXT: s_sub_i32 s2, 0, s11 -; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_mul_i32 s4, s4, s10 +; GFX6-NEXT: s_sub_i32 s4, s6, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, s10 +; GFX6-NEXT: s_cmp_ge_u32 s4, s10 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, s10 +; GFX6-NEXT: s_cmp_ge_u32 s4, s10 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: s_sub_i32 s5, 0, s11 +; GFX6-NEXT: v_mul_lo_u32 v0, s5, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 
v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v2, s7, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: v_readfirstlane_b32 s4, v2 -; GFX6-NEXT: s_mul_i32 s4, s4, s11 -; GFX6-NEXT: s_sub_i32 s4, s7, s4 -; GFX6-NEXT: s_sub_i32 s5, s4, s11 -; GFX6-NEXT: s_cmp_ge_u32 s4, s11 -; GFX6-NEXT: s_cselect_b32 s4, s5, s4 -; GFX6-NEXT: s_sub_i32 s5, s4, s11 -; GFX6-NEXT: s_cmp_ge_u32 s4, s11 -; GFX6-NEXT: s_cselect_b32 s4, s5, s4 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, s4 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_readfirstlane_b32 s0, v2 +; GFX6-NEXT: s_mul_i32 s0, s0, s11 +; GFX6-NEXT: s_sub_i32 s0, s7, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s11 +; GFX6-NEXT: s_cmp_ge_u32 s0, s11 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s11 +; GFX6-NEXT: s_cmp_ge_u32 s0, s11 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v3, s0 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 @@ -1842,34 +1843,34 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX6-LABEL: sdiv_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s19, 0xf000 ; GFX6-NEXT: s_mov_b32 s18, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_abs_i32 s2, 
s12 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_sub_i32 s3, 0, s2 +; GFX6-NEXT: s_abs_i32 s0, s12 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX6-NEXT: s_sub_i32 s1, 0, s0 ; GFX6-NEXT: s_xor_b32 s4, s8, s12 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 -; GFX6-NEXT: s_abs_i32 s3, s8 +; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX6-NEXT: s_abs_i32 s1, s8 ; GFX6-NEXT: s_ashr_i32 s8, s4, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 ; GFX6-NEXT: v_readfirstlane_b32 s4, v0 -; GFX6-NEXT: s_mul_i32 s4, s4, s2 -; GFX6-NEXT: s_sub_i32 s3, s3, s4 -; GFX6-NEXT: s_sub_i32 s4, s3, s2 -; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_mul_i32 s4, s4, s0 +; GFX6-NEXT: s_sub_i32 s1, s1, s4 +; GFX6-NEXT: s_sub_i32 s4, s1, s0 +; GFX6-NEXT: s_cmp_ge_u32 s1, s0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cselect_b32 s3, s4, s3 +; GFX6-NEXT: s_cselect_b32 s1, s4, s1 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s3, s2 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s1, s0 +; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: s_abs_i32 s4, s13 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s4 ; GFX6-NEXT: s_sub_i32 s5, 0, s4 @@ -1877,7 +1878,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_xor_b32 s6, s9, s13 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 @@ -1964,17 +1965,16 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX9-LABEL: sdiv_v4i32: ; GFX9: ; %bb.0: -; 
GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s2, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX9-NEXT: s_xor_b32 s3, s4, s8 -; GFX9-NEXT: s_sub_i32 s8, 0, s2 +; GFX9-NEXT: s_abs_i32 s0, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX9-NEXT: s_xor_b32 s1, s4, s8 +; GFX9-NEXT: s_sub_i32 s8, 0, s0 ; GFX9-NEXT: s_abs_i32 s4, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s3, s3, 31 +; GFX9-NEXT: s_ashr_i32 s1, s1, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s12, v0 @@ -1982,81 +1982,82 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8 ; GFX9-NEXT: s_add_i32 s12, s12, s8 ; GFX9-NEXT: s_mul_hi_u32 s8, s4, s12 -; GFX9-NEXT: s_mul_i32 s12, s8, s2 +; GFX9-NEXT: s_mul_i32 s12, s8, s0 ; GFX9-NEXT: s_sub_i32 s4, s4, s12 ; GFX9-NEXT: s_add_i32 s13, s8, 1 -; GFX9-NEXT: s_sub_i32 s12, s4, s2 -; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_sub_i32 s12, s4, s0 +; GFX9-NEXT: s_cmp_ge_u32 s4, s0 ; GFX9-NEXT: s_cselect_b32 s8, s13, s8 ; GFX9-NEXT: s_cselect_b32 s4, s12, s4 ; GFX9-NEXT: s_add_i32 s12, s8, 1 -; GFX9-NEXT: s_cmp_ge_u32 s4, s2 -; GFX9-NEXT: s_cselect_b32 s2, s12, s8 +; GFX9-NEXT: s_cmp_ge_u32 s4, s0 +; GFX9-NEXT: s_cselect_b32 s0, s12, s8 ; GFX9-NEXT: s_abs_i32 s4, s9 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 ; GFX9-NEXT: s_xor_b32 s8, s5, s9 ; GFX9-NEXT: s_sub_i32 s9, 0, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s2, s2, s3 +; GFX9-NEXT: s_sub_i32 s12, s0, s1 ; GFX9-NEXT: s_abs_i32 s5, s5 ; GFX9-NEXT: s_ashr_i32 s8, s8, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: 
v_readfirstlane_b32 s3, v0 -; GFX9-NEXT: s_mul_i32 s9, s9, s3 -; GFX9-NEXT: s_mul_hi_u32 s9, s3, s9 -; GFX9-NEXT: s_add_i32 s3, s3, s9 -; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3 -; GFX9-NEXT: s_mul_i32 s9, s3, s4 -; GFX9-NEXT: s_sub_i32 s5, s5, s9 -; GFX9-NEXT: s_add_i32 s12, s3, 1 -; GFX9-NEXT: s_sub_i32 s9, s5, s4 -; GFX9-NEXT: s_cmp_ge_u32 s5, s4 -; GFX9-NEXT: s_cselect_b32 s3, s12, s3 -; GFX9-NEXT: s_cselect_b32 s5, s9, s5 -; GFX9-NEXT: s_add_i32 s9, s3, 1 -; GFX9-NEXT: s_cmp_ge_u32 s5, s4 -; GFX9-NEXT: s_cselect_b32 s3, s9, s3 -; GFX9-NEXT: s_abs_i32 s4, s10 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_xor_b32 s3, s3, s8 -; GFX9-NEXT: s_sub_i32 s9, 0, s4 -; GFX9-NEXT: s_sub_i32 s3, s3, s8 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_mul_i32 s9, s9, s0 +; GFX9-NEXT: s_mul_hi_u32 s1, s0, s9 +; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: s_mul_hi_u32 s0, s5, s0 +; GFX9-NEXT: s_mul_i32 s1, s0, s4 +; GFX9-NEXT: s_sub_i32 s1, s5, s1 +; GFX9-NEXT: s_add_i32 s9, s0, 1 +; GFX9-NEXT: s_sub_i32 s5, s1, s4 +; GFX9-NEXT: s_cmp_ge_u32 s1, s4 +; GFX9-NEXT: s_cselect_b32 s0, s9, s0 +; GFX9-NEXT: s_cselect_b32 s1, s5, s1 +; GFX9-NEXT: s_add_i32 s5, s0, 1 +; GFX9-NEXT: s_cmp_ge_u32 s1, s4 +; GFX9-NEXT: s_cselect_b32 s0, s5, s0 +; GFX9-NEXT: s_abs_i32 s1, s10 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX9-NEXT: s_xor_b32 s0, s0, s8 +; GFX9-NEXT: s_xor_b32 s4, s6, s10 +; GFX9-NEXT: s_abs_i32 s5, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s5, s6, s10 -; GFX9-NEXT: s_abs_i32 s6, s6 -; GFX9-NEXT: s_ashr_i32 s5, s5, 31 +; GFX9-NEXT: s_sub_i32 s6, 0, s1 +; GFX9-NEXT: s_sub_i32 s8, s0, s8 +; GFX9-NEXT: s_ashr_i32 s4, s4, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s9, s9, s8 -; GFX9-NEXT: s_mul_hi_u32 s9, s8, s9 -; GFX9-NEXT: s_add_i32 s8, s8, s9 -; GFX9-NEXT: s_mul_hi_u32 s8, s6, s8 -; GFX9-NEXT: 
s_mul_i32 s9, s8, s4 -; GFX9-NEXT: s_sub_i32 s6, s6, s9 -; GFX9-NEXT: s_add_i32 s10, s8, 1 -; GFX9-NEXT: s_sub_i32 s9, s6, s4 -; GFX9-NEXT: s_cmp_ge_u32 s6, s4 -; GFX9-NEXT: s_cselect_b32 s8, s10, s8 -; GFX9-NEXT: s_cselect_b32 s6, s9, s6 -; GFX9-NEXT: s_add_i32 s9, s8, 1 -; GFX9-NEXT: s_cmp_ge_u32 s6, s4 -; GFX9-NEXT: s_cselect_b32 s4, s9, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_mul_i32 s6, s6, s0 +; GFX9-NEXT: s_mul_hi_u32 s6, s0, s6 +; GFX9-NEXT: s_add_i32 s0, s0, s6 +; GFX9-NEXT: s_mul_hi_u32 s0, s5, s0 +; GFX9-NEXT: s_mul_i32 s6, s0, s1 +; GFX9-NEXT: s_sub_i32 s5, s5, s6 +; GFX9-NEXT: s_add_i32 s9, s0, 1 +; GFX9-NEXT: s_sub_i32 s6, s5, s1 +; GFX9-NEXT: s_cmp_ge_u32 s5, s1 +; GFX9-NEXT: s_cselect_b32 s0, s9, s0 +; GFX9-NEXT: s_cselect_b32 s5, s6, s5 +; GFX9-NEXT: s_add_i32 s6, s0, 1 +; GFX9-NEXT: s_cmp_ge_u32 s5, s1 +; GFX9-NEXT: s_cselect_b32 s5, s6, s0 ; GFX9-NEXT: s_abs_i32 s6, s11 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX9-NEXT: s_xor_b32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_xor_b32 s5, s5, s4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_xor_b32 s2, s7, s11 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX9-NEXT: s_abs_i32 s3, s7 ; GFX9-NEXT: s_sub_i32 s7, 0, s6 -; GFX9-NEXT: s_sub_i32 s4, s4, s5 +; GFX9-NEXT: s_sub_i32 s4, s5, s4 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: s_ashr_i32 s2, s2, 31 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 ; GFX9-NEXT: s_mul_i32 s7, s7, s5 ; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7 @@ -2076,6 +2077,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_sub_i32 s2, s3, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv <4 x i32> %x, %y @@ -2242,35 +2244,34 @@ define 
amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX6-LABEL: srem_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_abs_i32 s2, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_sub_i32 s3, 0, s2 +; GFX6-NEXT: s_abs_i32 s0, s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX6-NEXT: s_sub_i32 s1, 0, s0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 -; GFX6-NEXT: s_abs_i32 s3, s4 +; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX6-NEXT: s_abs_i32 s1, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: s_mul_i32 s8, s8, s2 -; GFX6-NEXT: s_sub_i32 s3, s3, s8 -; GFX6-NEXT: s_sub_i32 s8, s3, s2 -; GFX6-NEXT: s_cmp_ge_u32 s3, s2 -; GFX6-NEXT: s_cselect_b32 s3, s8, s3 -; GFX6-NEXT: s_sub_i32 s8, s3, s2 -; GFX6-NEXT: s_cmp_ge_u32 s3, s2 -; GFX6-NEXT: s_cselect_b32 s2, s8, s3 -; GFX6-NEXT: s_abs_i32 s3, s9 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_sub_i32 s8, 0, s3 -; GFX6-NEXT: s_xor_b32 s2, s2, s4 -; GFX6-NEXT: s_sub_i32 s4, s2, s4 +; GFX6-NEXT: s_mul_i32 s8, s8, s0 +; GFX6-NEXT: s_sub_i32 s1, s1, s8 +; GFX6-NEXT: s_sub_i32 s8, s1, s0 +; GFX6-NEXT: s_cmp_ge_u32 s1, s0 +; GFX6-NEXT: s_cselect_b32 s1, s8, s1 +; GFX6-NEXT: s_sub_i32 s8, s1, s0 +; GFX6-NEXT: s_cmp_ge_u32 s1, s0 +; GFX6-NEXT: s_cselect_b32 s0, s8, s1 +; GFX6-NEXT: s_abs_i32 s1, s9 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX6-NEXT: s_sub_i32 s8, 0, s1 +; GFX6-NEXT: s_xor_b32 s0, s0, s4 +; GFX6-NEXT: s_sub_i32 s0, s0, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 
0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2280,21 +2281,22 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s3 -; GFX6-NEXT: s_sub_i32 s2, s8, s2 -; GFX6-NEXT: s_sub_i32 s8, s2, s3 -; GFX6-NEXT: s_cmp_ge_u32 s2, s3 -; GFX6-NEXT: s_cselect_b32 s2, s8, s2 -; GFX6-NEXT: s_sub_i32 s8, s2, s3 -; GFX6-NEXT: s_cmp_ge_u32 s2, s3 -; GFX6-NEXT: s_cselect_b32 s2, s8, s2 -; GFX6-NEXT: s_abs_i32 s3, s10 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_sub_i32 s8, 0, s3 -; GFX6-NEXT: s_xor_b32 s2, s2, s5 -; GFX6-NEXT: s_sub_i32 s5, s2, s5 +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_mul_i32 s4, s4, s1 +; GFX6-NEXT: s_sub_i32 s4, s8, s4 +; GFX6-NEXT: s_sub_i32 s8, s4, s1 +; GFX6-NEXT: s_cmp_ge_u32 s4, s1 +; GFX6-NEXT: s_cselect_b32 s4, s8, s4 +; GFX6-NEXT: s_sub_i32 s8, s4, s1 +; GFX6-NEXT: s_cmp_ge_u32 s4, s1 +; GFX6-NEXT: s_cselect_b32 s1, s8, s4 +; GFX6-NEXT: s_abs_i32 s4, s10 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX6-NEXT: s_sub_i32 s8, 0, s4 +; GFX6-NEXT: s_xor_b32 s1, s1, s5 +; GFX6-NEXT: s_sub_i32 s1, s1, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s8, v0 @@ -2303,59 +2305,59 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s3 -; GFX6-NEXT: s_sub_i32 s2, s8, s2 -; GFX6-NEXT: s_sub_i32 s8, s2, s3 -; GFX6-NEXT: s_cmp_ge_u32 s2, s3 -; GFX6-NEXT: s_cselect_b32 s2, s8, s2 -; GFX6-NEXT: s_sub_i32 s8, s2, s3 -; GFX6-NEXT: s_cmp_ge_u32 s2, s3 -; GFX6-NEXT: 
s_cselect_b32 s8, s8, s2 -; GFX6-NEXT: s_abs_i32 s9, s11 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX6-NEXT: s_sub_i32 s2, 0, s9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_readfirstlane_b32 s5, v0 +; GFX6-NEXT: s_mul_i32 s5, s5, s4 +; GFX6-NEXT: s_sub_i32 s5, s8, s5 +; GFX6-NEXT: s_sub_i32 s8, s5, s4 +; GFX6-NEXT: s_cmp_ge_u32 s5, s4 +; GFX6-NEXT: s_cselect_b32 s5, s8, s5 +; GFX6-NEXT: s_sub_i32 s8, s5, s4 +; GFX6-NEXT: s_cmp_ge_u32 s5, s4 +; GFX6-NEXT: s_cselect_b32 s4, s8, s5 +; GFX6-NEXT: s_abs_i32 s5, s11 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX6-NEXT: s_sub_i32 s8, 0, s5 +; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: s_abs_i32 s4, s7 -; GFX6-NEXT: v_mul_lo_u32 v1, s2, v2 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_abs_i32 s0, s7 +; GFX6-NEXT: v_mul_lo_u32 v1, s8, v2 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; GFX6-NEXT: s_xor_b32 s2, s4, s6 +; GFX6-NEXT: s_sub_i32 s2, s2, s6 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v1 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_ashr_i32 s5, s7, 31 -; GFX6-NEXT: s_xor_b32 s7, s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_ashr_i32 s1, s7, 31 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v2, s4, v2 -; GFX6-NEXT: s_sub_i32 s6, s7, s6 -; GFX6-NEXT: v_readfirstlane_b32 s7, v2 -; GFX6-NEXT: s_mul_i32 s7, s7, s9 -; GFX6-NEXT: s_sub_i32 s4, s4, s7 -; GFX6-NEXT: s_sub_i32 s7, s4, s9 -; GFX6-NEXT: s_cmp_ge_u32 s4, s9 -; GFX6-NEXT: s_cselect_b32 s4, s7, s4 -; GFX6-NEXT: s_sub_i32 s7, s4, s9 -; GFX6-NEXT: s_cmp_ge_u32 s4, s9 -; GFX6-NEXT: s_cselect_b32 s4, s7, s4 -; GFX6-NEXT: s_xor_b32 s4, s4, s5 -; GFX6-NEXT: s_sub_i32 s4, s4, s5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, s4 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NEXT: v_mul_hi_u32 
v2, s0, v2 +; GFX6-NEXT: v_readfirstlane_b32 s3, v2 +; GFX6-NEXT: s_mul_i32 s3, s3, s5 +; GFX6-NEXT: s_sub_i32 s0, s0, s3 +; GFX6-NEXT: s_sub_i32 s3, s0, s5 +; GFX6-NEXT: s_cmp_ge_u32 s0, s5 +; GFX6-NEXT: s_cselect_b32 s0, s3, s0 +; GFX6-NEXT: s_sub_i32 s3, s0, s5 +; GFX6-NEXT: s_cmp_ge_u32 s0, s5 +; GFX6-NEXT: s_cselect_b32 s0, s3, s0 +; GFX6-NEXT: s_xor_b32 s0, s0, s1 +; GFX6-NEXT: s_sub_i32 s0, s0, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v3, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s2, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX9-NEXT: s_sub_i32 s8, 0, s2 -; GFX9-NEXT: s_ashr_i32 s3, s4, 31 +; GFX9-NEXT: s_abs_i32 s0, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX9-NEXT: s_sub_i32 s8, 0, s0 +; GFX9-NEXT: s_ashr_i32 s1, s4, 31 ; GFX9-NEXT: s_abs_i32 s4, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -2365,72 +2367,73 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8 ; GFX9-NEXT: s_add_i32 s12, s12, s8 ; GFX9-NEXT: s_mul_hi_u32 s8, s4, s12 -; GFX9-NEXT: s_mul_i32 s8, s8, s2 +; GFX9-NEXT: s_mul_i32 s8, s8, s0 ; GFX9-NEXT: s_sub_i32 s4, s4, s8 -; GFX9-NEXT: s_sub_i32 s8, s4, s2 -; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_sub_i32 s8, s4, s0 +; GFX9-NEXT: s_cmp_ge_u32 s4, s0 ; GFX9-NEXT: s_cselect_b32 s4, s8, s4 -; GFX9-NEXT: s_sub_i32 s8, s4, s2 -; GFX9-NEXT: s_cmp_ge_u32 s4, s2 -; GFX9-NEXT: s_cselect_b32 s2, s8, s4 +; GFX9-NEXT: s_sub_i32 s8, s4, s0 +; GFX9-NEXT: s_cmp_ge_u32 s4, s0 +; GFX9-NEXT: s_cselect_b32 s0, s8, s4 ; GFX9-NEXT: s_abs_i32 s4, s9 ; 
GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 ; GFX9-NEXT: s_sub_i32 s9, 0, s4 -; GFX9-NEXT: s_sub_i32 s2, s2, s3 +; GFX9-NEXT: s_sub_i32 s12, s0, s1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_ashr_i32 s8, s5, 31 ; GFX9-NEXT: s_abs_i32 s5, s5 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s3, v0 -; GFX9-NEXT: s_mul_i32 s9, s9, s3 -; GFX9-NEXT: s_mul_hi_u32 s9, s3, s9 -; GFX9-NEXT: s_add_i32 s3, s3, s9 -; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3 -; GFX9-NEXT: s_mul_i32 s3, s3, s4 -; GFX9-NEXT: s_sub_i32 s3, s5, s3 -; GFX9-NEXT: s_sub_i32 s5, s3, s4 -; GFX9-NEXT: s_cmp_ge_u32 s3, s4 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: s_sub_i32 s5, s3, s4 -; GFX9-NEXT: s_cmp_ge_u32 s3, s4 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: s_abs_i32 s4, s10 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_xor_b32 s3, s3, s8 -; GFX9-NEXT: s_sub_i32 s9, 0, s4 -; GFX9-NEXT: s_sub_i32 s3, s3, s8 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_mul_i32 s9, s9, s0 +; GFX9-NEXT: s_mul_hi_u32 s1, s0, s9 +; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: s_mul_hi_u32 s0, s5, s0 +; GFX9-NEXT: s_mul_i32 s0, s0, s4 +; GFX9-NEXT: s_sub_i32 s0, s5, s0 +; GFX9-NEXT: s_sub_i32 s1, s0, s4 +; GFX9-NEXT: s_cmp_ge_u32 s0, s4 +; GFX9-NEXT: s_cselect_b32 s0, s1, s0 +; GFX9-NEXT: s_sub_i32 s1, s0, s4 +; GFX9-NEXT: s_cmp_ge_u32 s0, s4 +; GFX9-NEXT: s_cselect_b32 s0, s1, s0 +; GFX9-NEXT: s_abs_i32 s1, s10 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX9-NEXT: s_xor_b32 s0, s0, s8 +; GFX9-NEXT: s_ashr_i32 s4, s6, 31 +; GFX9-NEXT: s_abs_i32 s5, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s5, s6, 31 -; GFX9-NEXT: s_abs_i32 s6, s6 +; GFX9-NEXT: s_sub_i32 s6, 0, s1 +; GFX9-NEXT: s_sub_i32 s8, s0, s8 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; 
GFX9-NEXT: s_mul_i32 s9, s9, s8 -; GFX9-NEXT: s_mul_hi_u32 s9, s8, s9 -; GFX9-NEXT: s_add_i32 s8, s8, s9 -; GFX9-NEXT: s_mul_hi_u32 s8, s6, s8 -; GFX9-NEXT: s_mul_i32 s8, s8, s4 -; GFX9-NEXT: s_sub_i32 s6, s6, s8 -; GFX9-NEXT: s_sub_i32 s8, s6, s4 -; GFX9-NEXT: s_cmp_ge_u32 s6, s4 -; GFX9-NEXT: s_cselect_b32 s6, s8, s6 -; GFX9-NEXT: s_sub_i32 s8, s6, s4 -; GFX9-NEXT: s_cmp_ge_u32 s6, s4 -; GFX9-NEXT: s_cselect_b32 s4, s8, s6 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_mul_i32 s6, s6, s0 +; GFX9-NEXT: s_mul_hi_u32 s6, s0, s6 +; GFX9-NEXT: s_add_i32 s0, s0, s6 +; GFX9-NEXT: s_mul_hi_u32 s0, s5, s0 +; GFX9-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-NEXT: s_sub_i32 s0, s5, s0 +; GFX9-NEXT: s_sub_i32 s5, s0, s1 +; GFX9-NEXT: s_cmp_ge_u32 s0, s1 +; GFX9-NEXT: s_cselect_b32 s0, s5, s0 +; GFX9-NEXT: s_sub_i32 s5, s0, s1 +; GFX9-NEXT: s_cmp_ge_u32 s0, s1 +; GFX9-NEXT: s_cselect_b32 s5, s5, s0 ; GFX9-NEXT: s_abs_i32 s6, s11 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: s_xor_b32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_xor_b32 s5, s5, s4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_ashr_i32 s2, s7, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_abs_i32 s3, s7 ; GFX9-NEXT: s_sub_i32 s7, 0, s6 +; GFX9-NEXT: s_sub_i32 s4, s5, s4 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: s_sub_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 ; GFX9-NEXT: s_mul_i32 s7, s7, s5 ; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7 @@ -2448,6 +2451,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_sub_i32 s2, s3, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %r = srem <4 x i32> %x, %y @@ -2542,8 +2546,8 @@ 
define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: udiv_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2602,21 +2606,21 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: udiv_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s3, s6, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_and_b32 s2, s4, 0xffff +; GFX9-NEXT: s_and_b32 s1, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX9-NEXT: s_and_b32 s0, s4, 0xffff ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 @@ -2654,6 +2658,7 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv <4 x i16> %x, %y @@ -2756,8 +2761,8 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; 
GFX6-LABEL: urem_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2824,35 +2829,34 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: urem_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s3, s6, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_and_b32 s2, s4, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 +; GFX9-NEXT: s_and_b32 s9, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 +; GFX9-NEXT: s_and_b32 s8, s4, 0xffff ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 +; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 ; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 -; GFX9-NEXT: s_and_b32 s3, s7, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 ; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 ; GFX9-NEXT: v_trunc_f32_e32 v2, v5 -; GFX9-NEXT: s_and_b32 s8, s5, 0xffff +; GFX9-NEXT: s_and_b32 
s3, s5, 0xffff +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc ; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc @@ -2867,20 +2871,21 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 ; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_mul_lo_u32 v3, v3, s6 +; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0 ; GFX9-NEXT: v_sub_u32_e32 v4, s4, v1 -; GFX9-NEXT: v_sub_u32_e32 v1, s8, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s3, v2 ; GFX9-NEXT: v_sub_u32_e32 v2, s5, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 @@ -2994,8 +2999,8 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: sdiv_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3074,79 +3079,79 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) 
%out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: sdiv_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s0, s6 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9-NEXT: s_sext_i32_i16 s1, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: s_sext_i32_i16 s2, s6 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_sext_i32_i16 s3, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 +; GFX9-NEXT: s_xor_b32 s2, s3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s8, s0, 1 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s8, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s8, 0 -; GFX9-NEXT: s_ashr_i32 s1, s6, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s8, 0 +; GFX9-NEXT: s_ashr_i32 s3, s6, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s4 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: v_add_u32_e32 v3, s0, v3 +; GFX9-NEXT: v_add_u32_e32 v3, s2, v3 ; GFX9-NEXT: v_mul_f32_e32 v4, v1, v4 -; GFX9-NEXT: s_xor_b32 s0, s4, s1 +; GFX9-NEXT: s_xor_b32 s2, s4, s3 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: v_mad_f32 v1, -v4, v0, v1 -; GFX9-NEXT: s_or_b32 s4, s0, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GFX9-NEXT: 
s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: s_sext_i32_i16 s1, s7 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 -; GFX9-NEXT: s_cselect_b32 s0, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s0, v4 -; GFX9-NEXT: s_sext_i32_i16 s0, s5 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0 +; GFX9-NEXT: s_sext_i32_i16 s3, s7 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX9-NEXT: s_cselect_b32 s2, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v4, s2, v4 +; GFX9-NEXT: s_sext_i32_i16 s2, s5 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s4, s0, 1 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s4, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v1, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v1, -v5, v0, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: s_cselect_b32 s0, s4, 0 -; GFX9-NEXT: s_ashr_i32 s1, s7, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 -; GFX9-NEXT: v_add_u32_e32 v1, s0, v5 -; GFX9-NEXT: s_ashr_i32 s0, s5, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 +; GFX9-NEXT: s_cselect_b32 s2, s4, 0 +; GFX9-NEXT: s_ashr_i32 s3, s7, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX9-NEXT: v_add_u32_e32 v1, s2, v5 +; GFX9-NEXT: s_ashr_i32 s2, s5, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v0 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s4, s0, 1 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s4, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: 
v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v0, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v6 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v6 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv <4 x i16> %x, %y store <4 x i16> %r, ptr addrspace(1) %out @@ -3264,8 +3269,8 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: srem_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3356,78 +3361,78 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: srem_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s8, s6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GFX9-NEXT: s_sext_i32_i16 s9, s4 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 -; GFX9-NEXT: s_xor_b32 s0, s9, s8 +; GFX9-NEXT: s_xor_b32 s2, s9, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s10, s0, 1 +; 
GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s10, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s10, 0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s10, 0 ; GFX9-NEXT: s_ashr_i32 s6, s6, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s6 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: v_add_u32_e32 v1, s0, v3 +; GFX9-NEXT: v_add_u32_e32 v1, s2, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_xor_b32 s0, s4, s6 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_xor_b32 s2, s4, s6 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 -; GFX9-NEXT: s_or_b32 s8, s0, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| +; GFX9-NEXT: s_or_b32 s8, s2, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v0| ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s8, 0 +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s8, 0 ; GFX9-NEXT: s_sext_i32_i16 s8, s7 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s8 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v4 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v4 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6 ; GFX9-NEXT: s_sext_i32_i16 s6, s5 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GFX9-NEXT: s_xor_b32 s0, s6, s8 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s10, s0, 1 +; GFX9-NEXT: s_xor_b32 s2, s6, s8 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s10, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; 
GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s10, 0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, |v3| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s10, 0 ; GFX9-NEXT: s_ashr_i32 s7, s7, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s7 ; GFX9-NEXT: s_ashr_i32 s5, s5, 16 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_add_u32_e32 v3, s0, v5 +; GFX9-NEXT: v_add_u32_e32 v3, s2, v5 ; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX9-NEXT: s_xor_b32 s0, s5, s7 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_xor_b32 s2, s5, s7 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, s8 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: s_or_b32 s8, s0, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s8, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 +; GFX9-NEXT: s_or_b32 s8, s2, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v4| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s8, 0 +; GFX9-NEXT: v_add_u32_e32 v4, s2, v6 ; GFX9-NEXT: v_mul_lo_u32 v4, v4, s7 ; GFX9-NEXT: v_sub_u32_e32 v5, s9, v1 ; GFX9-NEXT: v_sub_u32_e32 v1, s6, v3 @@ -3436,7 +3441,7 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = srem <4 x i16> %x, %y store <4 x i16> %r, ptr addrspace(1) %out @@ -3467,8 +3472,8 @@ define amdgpu_kernel void @udiv_i3(ptr addrspace(1) 
%out, i3 %x, i3 %y) { ; ; GFX6-LABEL: udiv_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_u32 s2, s4, 0x30008 @@ -3489,15 +3494,15 @@ define amdgpu_kernel void @udiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX9-LABEL: udiv_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s0, s4, 0x30008 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 +; GFX9-NEXT: s_bfe_u32 s2, s4, 0x30008 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX9-NEXT: s_and_b32 s0, s4, 7 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 +; GFX9-NEXT: s_and_b32 s2, s4, 7 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1 @@ -3505,7 +3510,7 @@ define amdgpu_kernel void @udiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 -; GFX9-NEXT: global_store_byte v2, v0, s[2:3] +; GFX9-NEXT: global_store_byte v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv i3 %x, %y store i3 %r, ptr addrspace(1) %out @@ -3538,8 +3543,8 @@ define amdgpu_kernel void @urem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX6-LABEL: urem_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_u32 s2, 
s4, 0x30008 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 @@ -3563,24 +3568,24 @@ define amdgpu_kernel void @urem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX9-LABEL: urem_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s3, s2, 0x30008 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX9-NEXT: s_bfe_u32 s0, s4, 0x30008 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX9-NEXT: s_and_b32 s4, s2, 7 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 -; GFX9-NEXT: s_lshr_b32 s3, s2, 8 +; GFX9-NEXT: s_and_b32 s1, s4, 7 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s1 +; GFX9-NEXT: s_lshr_b32 s0, s4, 8 ; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 ; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v2 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_byte v1, v0, s[0:1] @@ -3618,8 +3623,8 @@ define amdgpu_kernel void @sdiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX6-LABEL: sdiv_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3645,28 +3650,28 @@ define amdgpu_kernel void @sdiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX9-LABEL: sdiv_i3: ; GFX9: ; 
%bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s4, 0x30008 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9-NEXT: s_bfe_i32 s1, s4, 0x30000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s4, s0, 1 +; GFX9-NEXT: s_bfe_i32 s2, s4, 0x30008 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_bfe_i32 s3, s4, 0x30000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 +; GFX9-NEXT: s_xor_b32 s2, s3, s2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s4, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 -; GFX9-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv i3 %x, %y store i3 %r, ptr addrspace(1) %out @@ -3703,8 +3708,8 @@ define amdgpu_kernel void @srem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX6-LABEL: srem_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_i32 s2, s4, 0x30008 ; GFX6-NEXT: v_cvt_f32_i32_e32 
v0, s2 @@ -3733,27 +3738,27 @@ define amdgpu_kernel void @srem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX9-LABEL: srem_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s2, s4, 0x30008 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_bfe_i32 s3, s4, 0x30000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 -; GFX9-NEXT: s_xor_b32 s2, s3, s2 +; GFX9-NEXT: s_bfe_i32 s0, s4, 0x30008 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9-NEXT: s_bfe_i32 s1, s4, 0x30000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_lshr_b32 s5, s4, 8 -; GFX9-NEXT: s_or_b32 s6, s2, 1 +; GFX9-NEXT: s_or_b32 s6, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s6, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s6, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v2 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 @@ -3832,8 +3837,8 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: udiv_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: 
s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3879,21 +3884,21 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: udiv_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s3, s6, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_and_b32 s2, s4, 0xffff +; GFX9-NEXT: s_and_b32 s1, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX9-NEXT: s_and_b32 s0, s4, 0xffff ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 @@ -3918,6 +3923,7 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v6, v2, s[0:1] offset:4 ; GFX9-NEXT: global_store_dword v6, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -3999,8 +4005,8 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: urem_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ 
-4052,33 +4058,33 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: urem_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s3, s6, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_and_b32 s2, s4, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 +; GFX9-NEXT: s_and_b32 s9, s6, 0xffff ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX9-NEXT: s_and_b32 s8, s4, 0xffff ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4 -; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 ; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 -; GFX9-NEXT: s_and_b32 s3, s7, 0xffff +; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 ; GFX9-NEXT: v_mad_f32 v2, -v5, v1, v3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s3 -; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s2 +; GFX9-NEXT: s_and_b32 s3, s5, 0xffff +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, 
vcc, 0, v4, vcc @@ -4087,17 +4093,16 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 ; GFX9-NEXT: v_mad_f32 v2, -v2, v3, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 +; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4 ; GFX9-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -4185,8 +4190,8 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: sdiv_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4247,62 +4252,62 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: sdiv_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s0, s6 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9-NEXT: s_sext_i32_i16 s1, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 -; GFX9-NEXT: s_xor_b32 s0, 
s1, s0 +; GFX9-NEXT: s_sext_i32_i16 s2, s6 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_sext_i32_i16 s3, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 +; GFX9-NEXT: s_xor_b32 s2, s3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s8, s0, 1 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s8, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s8, 0 -; GFX9-NEXT: s_ashr_i32 s1, s6, 16 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s8, 0 +; GFX9-NEXT: s_ashr_i32 s3, s6, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: v_add_u32_e32 v2, s0, v3 +; GFX9-NEXT: v_add_u32_e32 v2, s2, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_xor_b32 s0, s4, s1 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s4, s0, 1 +; GFX9-NEXT: s_xor_b32 s2, s4, s3 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s4, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: s_sext_i32_i16 s1, s7 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 -; GFX9-NEXT: s_cselect_b32 s0, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v3, s0, v4 -; GFX9-NEXT: s_sext_i32_i16 s0, s5 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 +; GFX9-NEXT: s_sext_i32_i16 s3, s7 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX9-NEXT: 
s_cselect_b32 s2, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v3, s2, v4 +; GFX9-NEXT: s_sext_i32_i16 s2, s5 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s4, s0, 1 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s4, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v5 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX9-NEXT: global_store_short v1, v0, s[2:3] offset:4 -; GFX9-NEXT: global_store_dword v1, v2, s[2:3] +; GFX9-NEXT: global_store_short v1, v0, s[0:1] offset:4 +; GFX9-NEXT: global_store_dword v1, v2, s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv <3 x i16> %x, %y store <3 x i16> %r, ptr addrspace(1) %out @@ -4394,8 +4399,8 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: srem_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4464,7 +4469,8 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: srem_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s8, s6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 @@ -4516,7 +4522,6 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GFX9-NEXT: s_cselect_b32 s2, s6, 0 ; GFX9-NEXT: v_add_u32_e32 v2, s2, v4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, s7 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -4524,7 +4529,6 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4 ; GFX9-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -4600,33 +4604,31 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; ; GFX6-LABEL: udiv_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_and_b32 s5, s8, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_and_b32 s4, s6, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 -; GFX6-NEXT: s_bfe_u32 s4, s8, 0xf000f +; GFX6-NEXT: s_and_b32 s2, s10, 0x7fff +; GFX6-NEXT: s_and_b32 s3, s0, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf000f +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 -; GFX6-NEXT: s_bfe_u32 s5, s6, 0xf000f -; 
GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: v_alignbit_b32 v2, s9, v2, 30 +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 +; GFX6-NEXT: s_bfe_u32 s3, s10, 0xf000f +; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 +; GFX6-NEXT: v_mov_b32_e32 v0, s10 +; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, v6, v7 ; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 @@ -4649,31 +4651,33 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: s_mov_b32 s4, s8 +; GFX6-NEXT: s_mov_b32 s5, s9 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s6, 0x7fff -; GFX9-NEXT: s_and_b32 s1, s2, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 -; GFX9-NEXT: s_bfe_u32 s0, s2, 0xf000f -; 
GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 +; GFX9-NEXT: s_and_b32 s2, s6, 0x7fff +; GFX9-NEXT: s_and_b32 s3, s0, 0x7fff +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0xf000f +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: s_bfe_u32 s1, s6, 0xf000f -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_alignbit_b32 v3, s3, v3, 30 +; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 +; GFX9-NEXT: s_bfe_u32 s3, s6, 0xf000f +; GFX9-NEXT: v_alignbit_b32 v3, s1, v3, 30 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 @@ -4787,41 +4791,41 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; ; GFX6-LABEL: urem_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 -; GFX6-NEXT: s_and_b32 s7, s8, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_and_b32 s5, s6, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s5 +; GFX6-NEXT: s_mov_b32 s4, s8 +; GFX6-NEXT: s_and_b32 s8, s0, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 +; GFX6-NEXT: s_and_b32 s3, s10, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GFX6-NEXT: s_bfe_u32 s5, s8, 0xf000f -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s5 -; GFX6-NEXT: s_bfe_u32 s7, s6, 0xf000f +; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 +; 
GFX6-NEXT: s_bfe_u32 s1, s0, 0xf000f +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s1 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s7 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: s_bfe_u32 s8, s10, 0xf000f +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s8 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 -; GFX6-NEXT: v_alignbit_b32 v2, s9, v2, 30 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s6, v1 +; GFX6-NEXT: v_mov_b32_e32 v0, s10 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s10, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, v2 +; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 ; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX6-NEXT: v_cvt_f32_u32_e32 v7, v0 -; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4 +; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mad_f32 v3, -v1, v5, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 @@ -4830,32 +4834,32 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v3 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX6-NEXT: v_mad_f32 v3, -v3, v4, v7 -; GFX6-NEXT: s_lshr_b32 s5, s8, 15 +; GFX6-NEXT: s_lshr_b32 s0, s0, 15 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_lshr_b32 s4, s6, 15 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v1 +; GFX6-NEXT: s_lshr_b32 s2, s10, 15 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: 
v_and_b32_e32 v3, 0x7fff, v3 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: s_mov_b32 s5, s9 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 @@ -4996,52 +5000,50 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; ; GFX6-LABEL: sdiv_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_bfe_i32 s4, s8, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_bfe_i32 s5, s6, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s10 +; GFX6-NEXT: s_bfe_i32 s2, s0, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_alignbit_b32 v1, s1, v1, 30 +; GFX6-NEXT: s_bfe_i32 s1, s10, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s1 ; GFX6-NEXT: 
v_rcp_iflag_f32_e32 v4, v2 -; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_xor_b32 s1, s1, s2 +; GFX6-NEXT: s_ashr_i32 s1, s1, 30 +; GFX6-NEXT: s_or_b32 s1, s1, 1 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 -; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 -; GFX6-NEXT: s_or_b32 s7, s4, 1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v2| -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v2| +; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX6-NEXT: s_cselect_b32 s4, s7, 0 -; GFX6-NEXT: s_bfe_i32 s5, s8, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, s4, v4 -; GFX6-NEXT: s_bfe_i32 s4, s6, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4 +; GFX6-NEXT: s_cselect_b32 s1, s1, 0 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, s1, v4 +; GFX6-NEXT: s_bfe_i32 s1, s10, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: v_alignbit_b32 v1, s9, v1, 30 -; GFX6-NEXT: s_xor_b32 s4, s4, s5 +; GFX6-NEXT: s_xor_b32 s0, s1, s0 +; GFX6-NEXT: s_ashr_i32 s0, s0, 30 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 15 ; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5 -; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 15 -; GFX6-NEXT: s_or_b32 s6, s4, 1 +; GFX6-NEXT: s_or_b32 s2, s0, 1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v2| +; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v2| ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, v1 -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 +; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], 
exec +; GFX6-NEXT: s_cselect_b32 s0, s2, 0 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, s4, v5 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, s0, v5 ; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2 ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 @@ -5059,43 +5061,46 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: s_mov_b32 s4, s8 +; GFX6-NEXT: s_mov_b32 s5, s9 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_bfe_i32 s2, s0, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, 30 ; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf0000 -; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: v_alignbit_b32 v1, s3, v1, 30 -; GFX9-NEXT: s_or_b32 s3, s0, 1 +; GFX9-NEXT: s_xor_b32 s1, s1, s2 +; GFX9-NEXT: s_ashr_i32 s1, s1, 30 +; GFX9-NEXT: s_or_b32 s1, s1, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; 
GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, |v3| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: s_cselect_b32 s0, s3, 0 -; GFX9-NEXT: s_bfe_i32 s1, s2, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s1 -; GFX9-NEXT: v_add_u32_e32 v4, s0, v5 -; GFX9-NEXT: s_bfe_i32 s0, s6, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 +; GFX9-NEXT: s_cselect_b32 s1, s1, 0 +; GFX9-NEXT: s_bfe_i32 s0, s0, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 +; GFX9-NEXT: v_add_u32_e32 v4, s1, v5 +; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 @@ -5105,7 +5110,6 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s2, 0 @@ -5223,73 +5227,73 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; ; GFX6-LABEL: srem_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_bfe_i32 s4, s8, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; 
GFX6-NEXT: s_bfe_i32 s5, s6, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s5 +; GFX6-NEXT: s_bfe_i32 s2, s10, 0xf0000 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 +; GFX6-NEXT: s_bfe_i32 s1, s0, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 +; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s2 +; GFX6-NEXT: s_xor_b32 s1, s2, s1 +; GFX6-NEXT: s_ashr_i32 s1, s1, 30 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: s_mov_b32 s4, s8 +; GFX6-NEXT: s_mov_b32 s5, s9 +; GFX6-NEXT: s_lshr_b32 s8, s10, 15 ; GFX6-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX6-NEXT: v_trunc_f32_e32 v6, v6 -; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: v_mad_f32 v5, -v6, v4, v5 ; GFX6-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 -; GFX6-NEXT: s_lshr_b32 s7, s6, 15 -; GFX6-NEXT: v_alignbit_b32 v2, s9, v2, 30 -; GFX6-NEXT: s_lshr_b32 s9, s8, 15 -; GFX6-NEXT: s_or_b32 s10, s4, 1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, |v4| -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX6-NEXT: s_cselect_b32 s4, s10, 0 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, s4, v6 -; GFX6-NEXT: s_bfe_i32 s4, s8, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s4 -; GFX6-NEXT: s_bfe_i32 s5, s6, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v6, s5 -; GFX6-NEXT: v_mul_lo_u32 v4, v4, s8 +; GFX6-NEXT: s_lshr_b32 s9, s0, 15 +; GFX6-NEXT: s_or_b32 s1, s1, 1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v4| +; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX6-NEXT: s_cselect_b32 s1, s1, 0 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, s1, v6 +; GFX6-NEXT: v_mul_lo_u32 v4, v4, s0 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s0 +; GFX6-NEXT: s_bfe_i32 s1, s10, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v6, s1 +; GFX6-NEXT: s_xor_b32 s0, s1, s0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v2 -; GFX6-NEXT: 
s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_ashr_i32 s0, s0, 30 +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 15 ; GFX6-NEXT: v_mul_f32_e32 v7, v6, v7 ; GFX6-NEXT: v_trunc_f32_e32 v7, v7 ; GFX6-NEXT: v_mad_f32 v6, -v7, v5, v6 -; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 15 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s6, v4 -; GFX6-NEXT: s_or_b32 s6, s4, 1 +; GFX6-NEXT: s_or_b32 s2, s0, 1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v7, v7 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v6|, |v5| +; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v6|, |v5| ; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v2 -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: v_mov_b32_e32 v0, s10 +; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 +; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v0 -; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: s_cselect_b32 s0, s2, 0 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, s4, v7 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, s0, v7 ; GFX6-NEXT: v_cvt_f32_i32_e32 v7, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s10, v4 ; GFX6-NEXT: v_mul_f32_e32 v2, v7, v8 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v7, -v2, v6, v7 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX6-NEXT: v_mul_lo_u32 v5, v5, s9 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v5 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s8, v5 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 @@ -5297,54 +5301,54 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 15, v2 ; 
GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf0000 -; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s1 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: s_bfe_i32 s2, s6, 0xf0000 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, 30 +; GFX9-NEXT: s_bfe_i32 s1, s0, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2 +; GFX9-NEXT: s_xor_b32 s1, s2, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_ashr_i32 s1, s1, 30 ; GFX9-NEXT: s_lshr_b32 s8, s6, 15 +; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 -; GFX9-NEXT: v_alignbit_b32 v1, s3, v1, 30 -; GFX9-NEXT: s_lshr_b32 s3, s2, 15 -; GFX9-NEXT: s_or_b32 s7, s0, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s7, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 -; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 -; 
GFX9-NEXT: s_bfe_i32 s1, s6, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v6, s1 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: s_lshr_b32 s7, s0, 15 +; GFX9-NEXT: s_or_b32 s1, s1, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v4| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s1, s1, 0 +; GFX9-NEXT: v_add_u32_e32 v4, s1, v6 +; GFX9-NEXT: s_bfe_i32 s1, s0, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s1 +; GFX9-NEXT: v_mul_lo_u32 v4, v4, s0 +; GFX9-NEXT: s_bfe_i32 s0, s6, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v6, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v1 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 ; GFX9-NEXT: v_mul_f32_e32 v7, v6, v7 ; GFX9-NEXT: v_trunc_f32_e32 v7, v7 ; GFX9-NEXT: v_mad_f32 v6, -v7, v5, v6 ; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v7 -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s2 +; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 ; GFX9-NEXT: s_or_b32 s2, s0, 1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v6|, |v5| ; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v1 @@ -5363,7 +5367,7 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: v_mad_f32 v7, -v7, v6, v8 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v5, s3 +; GFX9-NEXT: v_mul_lo_u32 v5, v5, s7 ; GFX9-NEXT: v_add_u32_e32 v1, v9, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 @@ -5393,8 +5397,8 @@ define amdgpu_kernel void @udiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: udiv_i32_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -5409,17 +5413,17 @@ define 
amdgpu_kernel void @udiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: udiv_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 -; GFX9-NEXT: s_sub_i32 s1, s4, s0 -; GFX9-NEXT: s_lshr_b32 s1, s1, 1 -; GFX9-NEXT: s_add_i32 s1, s1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s1, 20 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_mul_hi_u32 s2, s4, 0xb2a50881 +; GFX9-NEXT: s_sub_i32 s3, s4, s2 +; GFX9-NEXT: s_lshr_b32 s3, s3, 1 +; GFX9-NEXT: s_add_i32 s3, s3, s2 +; GFX9-NEXT: s_lshr_b32 s2, s3, 20 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv i32 %x, 1235195 store i32 %r, ptr addrspace(1) %out @@ -5434,8 +5438,8 @@ define amdgpu_kernel void @udiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: udiv_i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5446,13 +5450,13 @@ define amdgpu_kernel void @udiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: udiv_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s4, 12 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: 
s_lshr_b32 s2, s4, 12 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv i32 %x, 4096 store i32 %r, ptr addrspace(1) %out @@ -5468,7 +5472,7 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX6-LABEL: udiv_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5482,7 +5486,7 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: udiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_i32 s3, s3, 12 @@ -5509,7 +5513,7 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX6-LABEL: udiv_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5524,7 +5528,7 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: udiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s2, s2, 12 @@ -5551,7 +5555,7 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, < ; ; GFX6-LABEL: udiv_v2i32_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x100101 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ 
-5570,7 +5574,7 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, < ; ; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_hi_u32 s4, s3, 0x100101 @@ -5660,42 +5664,42 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_sub_i32 s3, 0, s2 +; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX6-NEXT: s_sub_i32 s1, 0, s0 ; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s7 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s3, v0 -; GFX6-NEXT: s_mul_i32 s3, s3, s2 -; GFX6-NEXT: s_sub_i32 s3, s4, s3 -; GFX6-NEXT: s_sub_i32 s4, s3, s2 -; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: v_readfirstlane_b32 s1, v0 +; GFX6-NEXT: s_mul_i32 s1, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, s4, s1 +; GFX6-NEXT: s_sub_i32 s4, s1, s0 +; GFX6-NEXT: s_cmp_ge_u32 s1, s0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: 
s_cselect_b32 s3, s4, s3 +; GFX6-NEXT: s_cselect_b32 s1, s4, s1 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s3, s2 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s1, s0 +; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: s_sub_i32 s4, 0, s6 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: v_readfirstlane_b32 s0, v1 @@ -5716,54 +5720,54 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GFX9-NEXT: s_sub_i32 s6, 0, s3 +; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX9-NEXT: s_sub_i32 s2, 0, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s7, v0 -; GFX9-NEXT: s_mul_i32 s6, s6, s7 -; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_mul_hi_u32 s6, s4, s7 -; GFX9-NEXT: s_mul_i32 s7, s6, s3 -; GFX9-NEXT: s_sub_i32 s4, s4, s7 -; 
GFX9-NEXT: s_add_i32 s9, s6, 1 -; GFX9-NEXT: s_sub_i32 s7, s4, s3 -; GFX9-NEXT: s_cmp_ge_u32 s4, s3 -; GFX9-NEXT: s_cselect_b32 s6, s9, s6 -; GFX9-NEXT: s_cselect_b32 s4, s7, s4 -; GFX9-NEXT: s_add_i32 s7, s6, 1 -; GFX9-NEXT: s_cmp_ge_u32 s4, s3 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_mul_i32 s2, s2, s3 +; GFX9-NEXT: s_mul_hi_u32 s2, s3, s2 +; GFX9-NEXT: s_add_i32 s3, s3, s2 +; GFX9-NEXT: s_mul_hi_u32 s2, s4, s3 +; GFX9-NEXT: s_mul_i32 s3, s2, s6 +; GFX9-NEXT: s_sub_i32 s3, s4, s3 +; GFX9-NEXT: s_add_i32 s9, s2, 1 +; GFX9-NEXT: s_sub_i32 s4, s3, s6 +; GFX9-NEXT: s_cmp_ge_u32 s3, s6 +; GFX9-NEXT: s_cselect_b32 s2, s9, s2 +; GFX9-NEXT: s_cselect_b32 s3, s4, s3 +; GFX9-NEXT: s_add_i32 s4, s2, 1 +; GFX9-NEXT: s_cmp_ge_u32 s3, s6 ; GFX9-NEXT: v_readfirstlane_b32 s8, v1 -; GFX9-NEXT: s_cselect_b32 s3, s7, s6 -; GFX9-NEXT: s_sub_i32 s4, 0, s2 -; GFX9-NEXT: s_mul_i32 s4, s4, s8 -; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 -; GFX9-NEXT: s_add_i32 s8, s8, s4 -; GFX9-NEXT: s_mul_hi_u32 s4, s5, s8 -; GFX9-NEXT: s_mul_i32 s6, s4, s2 -; GFX9-NEXT: s_sub_i32 s5, s5, s6 -; GFX9-NEXT: s_add_i32 s7, s4, 1 -; GFX9-NEXT: s_sub_i32 s6, s5, s2 -; GFX9-NEXT: s_cmp_ge_u32 s5, s2 -; GFX9-NEXT: s_cselect_b32 s4, s7, s4 -; GFX9-NEXT: s_cselect_b32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s6, s4, 1 -; GFX9-NEXT: s_cmp_ge_u32 s5, s2 -; GFX9-NEXT: s_cselect_b32 s2, s6, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_cselect_b32 s2, s4, s2 +; GFX9-NEXT: s_sub_i32 s3, 0, s7 +; GFX9-NEXT: s_mul_i32 s3, s3, s8 +; GFX9-NEXT: s_mul_hi_u32 s3, s8, s3 +; GFX9-NEXT: s_add_i32 s8, s8, s3 +; GFX9-NEXT: s_mul_hi_u32 s3, s5, s8 +; GFX9-NEXT: s_mul_i32 s4, s3, s7 +; GFX9-NEXT: s_sub_i32 s4, s5, s4 +; GFX9-NEXT: s_add_i32 s6, s3, 1 +; GFX9-NEXT: s_sub_i32 s5, s4, s7 +; GFX9-NEXT: s_cmp_ge_u32 s4, s7 +; GFX9-NEXT: s_cselect_b32 s3, s6, s3 +; GFX9-NEXT: s_cselect_b32 s4, s5, s4 +; GFX9-NEXT: s_add_i32 s5, s3, 1 +; GFX9-NEXT: s_cmp_ge_u32 s4, s7 +; GFX9-NEXT: 
s_cselect_b32 s3, s5, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -5780,10 +5784,10 @@ define amdgpu_kernel void @urem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: urem_i32_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881 ; GFX6-NEXT: s_mov_b32 s2, 0x12d8fb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 @@ -5799,19 +5803,19 @@ define amdgpu_kernel void @urem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: urem_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 -; GFX9-NEXT: s_sub_i32 s1, s4, s0 -; GFX9-NEXT: s_lshr_b32 s1, s1, 1 -; GFX9-NEXT: s_add_i32 s1, s1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s1, 20 -; GFX9-NEXT: s_mul_i32 s0, s0, 0x12d8fb -; GFX9-NEXT: s_sub_i32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_mul_hi_u32 s2, s4, 0xb2a50881 +; GFX9-NEXT: s_sub_i32 s3, s4, s2 +; GFX9-NEXT: s_lshr_b32 s3, s3, 1 +; GFX9-NEXT: s_add_i32 s3, s3, s2 +; GFX9-NEXT: s_lshr_b32 s2, s3, 20 +; GFX9-NEXT: s_mul_i32 s2, s2, 0x12d8fb +; GFX9-NEXT: s_sub_i32 s2, s4, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %r = urem i32 %x, 1235195 store i32 %r, ptr addrspace(1) %out @@ -5826,8 +5830,8 @@ define amdgpu_kernel void @urem_i32_pow2k_denom(ptr addrspace(1) 
%out, i32 %x) { ; ; GFX6-LABEL: urem_i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5838,13 +5842,13 @@ define amdgpu_kernel void @urem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: urem_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s4, 0xfff -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_and_b32 s2, s4, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %r = urem i32 %x, 4096 store i32 %r, ptr addrspace(1) %out @@ -5860,7 +5864,7 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX6-LABEL: urem_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5875,7 +5879,7 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: urem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 @@ -5903,7 +5907,7 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX6-LABEL: urem_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5918,7 +5922,7 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: urem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xfff @@ -6000,35 +6004,35 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: urem_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_sub_i32 s3, 0, s2 +; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX6-NEXT: s_sub_i32 s1, 0, s0 ; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s7 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s3, v0 -; GFX6-NEXT: s_mul_i32 s3, s3, s2 -; GFX6-NEXT: s_sub_i32 s3, s4, s3 -; GFX6-NEXT: s_sub_i32 s4, s3, s2 -; GFX6-NEXT: s_cmp_ge_u32 s3, s2 -; GFX6-NEXT: s_cselect_b32 s3, s4, s3 -; GFX6-NEXT: s_sub_i32 s4, s3, s2 -; GFX6-NEXT: s_cmp_ge_u32 s3, s2 -; GFX6-NEXT: s_cselect_b32 s4, s4, s3 -; GFX6-NEXT: s_sub_i32 s2, 0, s6 -; 
GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX6-NEXT: v_readfirstlane_b32 s1, v0 +; GFX6-NEXT: s_mul_i32 s1, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, s4, s1 +; GFX6-NEXT: s_sub_i32 s4, s1, s0 +; GFX6-NEXT: s_cmp_ge_u32 s1, s0 +; GFX6-NEXT: s_cselect_b32 s1, s4, s1 +; GFX6-NEXT: s_sub_i32 s4, s1, s0 +; GFX6-NEXT: s_cmp_ge_u32 s1, s0 +; GFX6-NEXT: s_cselect_b32 s4, s4, s1 +; GFX6-NEXT: s_sub_i32 s0, 0, s6 +; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 @@ -6045,55 +6049,56 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_cselect_b32 s5, s7, s5 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GFX9-NEXT: s_sub_i32 s6, 0, s3 +; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX9-NEXT: s_sub_i32 s2, 0, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s7, v0 -; GFX9-NEXT: s_mul_i32 s6, s6, s7 -; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 -; GFX9-NEXT: 
s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_mul_hi_u32 s6, s4, s7 -; GFX9-NEXT: s_mul_i32 s6, s6, s3 -; GFX9-NEXT: s_sub_i32 s4, s4, s6 -; GFX9-NEXT: s_sub_i32 s6, s4, s3 -; GFX9-NEXT: s_cmp_ge_u32 s4, s3 -; GFX9-NEXT: s_cselect_b32 s4, s6, s4 -; GFX9-NEXT: s_sub_i32 s6, s4, s3 -; GFX9-NEXT: s_cmp_ge_u32 s4, s3 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_mul_i32 s2, s2, s3 +; GFX9-NEXT: s_mul_hi_u32 s2, s3, s2 +; GFX9-NEXT: s_add_i32 s3, s3, s2 +; GFX9-NEXT: s_mul_hi_u32 s2, s4, s3 +; GFX9-NEXT: s_mul_i32 s2, s2, s6 +; GFX9-NEXT: s_sub_i32 s2, s4, s2 +; GFX9-NEXT: s_sub_i32 s3, s2, s6 +; GFX9-NEXT: s_cmp_ge_u32 s2, s6 +; GFX9-NEXT: s_cselect_b32 s2, s3, s2 +; GFX9-NEXT: s_sub_i32 s3, s2, s6 +; GFX9-NEXT: s_cmp_ge_u32 s2, s6 ; GFX9-NEXT: v_readfirstlane_b32 s8, v1 -; GFX9-NEXT: s_cselect_b32 s3, s6, s4 -; GFX9-NEXT: s_sub_i32 s4, 0, s2 -; GFX9-NEXT: s_mul_i32 s4, s4, s8 -; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 -; GFX9-NEXT: s_add_i32 s8, s8, s4 -; GFX9-NEXT: s_mul_hi_u32 s4, s5, s8 -; GFX9-NEXT: s_mul_i32 s4, s4, s2 -; GFX9-NEXT: s_sub_i32 s4, s5, s4 -; GFX9-NEXT: s_sub_i32 s5, s4, s2 -; GFX9-NEXT: s_cmp_ge_u32 s4, s2 -; GFX9-NEXT: s_cselect_b32 s4, s5, s4 -; GFX9-NEXT: s_sub_i32 s5, s4, s2 -; GFX9-NEXT: s_cmp_ge_u32 s4, s2 -; GFX9-NEXT: s_cselect_b32 s2, s5, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_cselect_b32 s2, s3, s2 +; GFX9-NEXT: s_sub_i32 s3, 0, s7 +; GFX9-NEXT: s_mul_i32 s3, s3, s8 +; GFX9-NEXT: s_mul_hi_u32 s3, s8, s3 +; GFX9-NEXT: s_add_i32 s8, s8, s3 +; GFX9-NEXT: s_mul_hi_u32 s3, s5, s8 +; GFX9-NEXT: s_mul_i32 s3, s3, s7 +; GFX9-NEXT: s_sub_i32 s3, s5, s3 +; GFX9-NEXT: s_sub_i32 s4, s3, s7 +; GFX9-NEXT: s_cmp_ge_u32 s3, s7 +; GFX9-NEXT: s_cselect_b32 s3, s4, s3 +; GFX9-NEXT: s_sub_i32 s4, s3, s7 +; GFX9-NEXT: s_cmp_ge_u32 s3, s7 +; GFX9-NEXT: s_cselect_b32 s3, s4, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: 
s_endpgm %shl.y = shl <2 x i32> , %y @@ -6110,8 +6115,8 @@ define amdgpu_kernel void @sdiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: sdiv_i32_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -6126,17 +6131,17 @@ define amdgpu_kernel void @sdiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: sdiv_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 -; GFX9-NEXT: s_add_i32 s0, s0, s4 -; GFX9-NEXT: s_lshr_b32 s1, s0, 31 -; GFX9-NEXT: s_ashr_i32 s0, s0, 20 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_mul_hi_i32 s2, s4, 0xd9528441 +; GFX9-NEXT: s_add_i32 s2, s2, s4 +; GFX9-NEXT: s_lshr_b32 s3, s2, 31 +; GFX9-NEXT: s_ashr_i32 s2, s2, 20 +; GFX9-NEXT: s_add_i32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv i32 %x, 1235195 store i32 %r, ptr addrspace(1) %out @@ -6151,8 +6156,8 @@ define amdgpu_kernel void @sdiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: sdiv_i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6166,16 +6171,16 @@ define 
amdgpu_kernel void @sdiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: sdiv_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s4, 31 -; GFX9-NEXT: s_lshr_b32 s0, s0, 20 -; GFX9-NEXT: s_add_i32 s4, s4, s0 -; GFX9-NEXT: s_ashr_i32 s0, s4, 12 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_ashr_i32 s2, s4, 31 +; GFX9-NEXT: s_lshr_b32 s2, s2, 20 +; GFX9-NEXT: s_add_i32 s4, s4, s2 +; GFX9-NEXT: s_ashr_i32 s2, s4, 12 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv i32 %x, 4096 store i32 %r, ptr addrspace(1) %out @@ -6191,7 +6196,7 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX6-LABEL: sdiv_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6234,7 +6239,7 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: sdiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 @@ -6289,7 +6294,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX6-LABEL: sdiv_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ 
-6310,7 +6315,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: sdiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s2, 31 @@ -6343,7 +6348,7 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, ; ; GFX6-LABEL: ssdiv_v2i32_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x80080081 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -6365,7 +6370,7 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, ; ; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s2, 31 @@ -6476,50 +6481,50 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 -; GFX6-NEXT: s_abs_i32 s3, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_sub_i32 s6, 0, s3 -; GFX6-NEXT: s_xor_b32 s2, s4, s2 +; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6 +; GFX6-NEXT: s_abs_i32 s1, s0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX6-NEXT: s_sub_i32 s6, 0, s1 +; GFX6-NEXT: s_xor_b32 s0, s4, s0 ; GFX6-NEXT: s_lshl_b32 s7, 0x1000, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s6, 
v0 ; GFX6-NEXT: s_abs_i32 s6, s4 -; GFX6-NEXT: s_ashr_i32 s4, s2, 31 +; GFX6-NEXT: s_ashr_i32 s4, s0, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s3 -; GFX6-NEXT: s_sub_i32 s2, s6, s2 -; GFX6-NEXT: s_sub_i32 s6, s2, s3 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s0, s0, s1 +; GFX6-NEXT: s_sub_i32 s0, s6, s0 +; GFX6-NEXT: s_sub_i32 s6, s0, s1 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: s_cmp_ge_u32 s0, s1 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: s_cselect_b32 s2, s6, s2 +; GFX6-NEXT: s_cselect_b32 s0, s6, s0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: s_cmp_ge_u32 s0, s1 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: s_abs_i32 s6, s7 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_sub_i32 s2, 0, s6 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: s_xor_b32 s7, s5, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: s_xor_b32 s7, s5, s7 ; GFX6-NEXT: s_abs_i32 s5, s5 ; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: s_ashr_i32 s7, s7, 31 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mul_lo_u32 v3, s2, v2 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 @@ -6539,71 +6544,73 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX6-NEXT: v_xor_b32_e32 v1, s7, v1 ; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s7, v1 +; GFX6-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s6 -; GFX9-NEXT: s_abs_i32 s3, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s6 +; GFX9-NEXT: s_abs_i32 s1, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX9-NEXT: s_xor_b32 s0, s4, s0 ; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s7 ; GFX9-NEXT: s_abs_i32 s7, s4 -; GFX9-NEXT: s_xor_b32 s2, s4, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s4, 0, s3 -; GFX9-NEXT: s_ashr_i32 s2, s2, 31 +; GFX9-NEXT: s_ashr_i32 s4, s0, 31 +; GFX9-NEXT: s_sub_i32 s0, 0, s1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s4, s4, s8 -; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 -; GFX9-NEXT: s_add_i32 s8, s8, s4 -; GFX9-NEXT: s_mul_hi_u32 s4, s7, s8 -; GFX9-NEXT: s_mul_i32 s8, s4, s3 +; GFX9-NEXT: s_mul_i32 s0, s0, s8 +; GFX9-NEXT: s_mul_hi_u32 s0, s8, s0 +; GFX9-NEXT: s_add_i32 s8, s8, s0 +; GFX9-NEXT: s_mul_hi_u32 s0, s7, s8 +; GFX9-NEXT: s_mul_i32 s8, s0, s1 ; GFX9-NEXT: s_sub_i32 s7, s7, s8 -; GFX9-NEXT: s_add_i32 s9, s4, 1 -; GFX9-NEXT: s_sub_i32 s8, s7, s3 -; GFX9-NEXT: s_cmp_ge_u32 s7, s3 -; GFX9-NEXT: s_cselect_b32 s4, s9, s4 +; GFX9-NEXT: s_add_i32 s9, s0, 1 +; GFX9-NEXT: s_sub_i32 s8, s7, s1 +; GFX9-NEXT: s_cmp_ge_u32 s7, s1 +; GFX9-NEXT: s_cselect_b32 s0, s9, s0 ; GFX9-NEXT: s_cselect_b32 s7, s8, s7 -; GFX9-NEXT: s_add_i32 s8, s4, 1 -; GFX9-NEXT: s_cmp_ge_u32 s7, s3 -; GFX9-NEXT: s_cselect_b32 s3, s8, s4 -; GFX9-NEXT: s_abs_i32 s4, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_xor_b32 s3, s3, s2 -; GFX9-NEXT: s_sub_i32 s7, 0, s4 -; 
GFX9-NEXT: s_sub_i32 s2, s3, s2 +; GFX9-NEXT: s_add_i32 s8, s0, 1 +; GFX9-NEXT: s_cmp_ge_u32 s7, s1 +; GFX9-NEXT: s_cselect_b32 s7, s8, s0 +; GFX9-NEXT: s_abs_i32 s8, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_xor_b32 s2, s5, s6 +; GFX9-NEXT: s_abs_i32 s3, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s6, s5, s6 -; GFX9-NEXT: s_abs_i32 s5, s5 -; GFX9-NEXT: s_ashr_i32 s6, s6, 31 +; GFX9-NEXT: s_xor_b32 s5, s7, s4 +; GFX9-NEXT: s_sub_i32 s6, 0, s8 +; GFX9-NEXT: s_sub_i32 s4, s5, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s3, v0 -; GFX9-NEXT: s_mul_i32 s7, s7, s3 -; GFX9-NEXT: s_mul_hi_u32 s7, s3, s7 -; GFX9-NEXT: s_add_i32 s3, s3, s7 -; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3 -; GFX9-NEXT: s_mul_i32 s7, s3, s4 -; GFX9-NEXT: s_sub_i32 s5, s5, s7 -; GFX9-NEXT: s_add_i32 s8, s3, 1 -; GFX9-NEXT: s_sub_i32 s7, s5, s4 -; GFX9-NEXT: s_cmp_ge_u32 s5, s4 -; GFX9-NEXT: s_cselect_b32 s3, s8, s3 -; GFX9-NEXT: s_cselect_b32 s5, s7, s5 -; GFX9-NEXT: s_add_i32 s7, s3, 1 -; GFX9-NEXT: s_cmp_ge_u32 s5, s4 -; GFX9-NEXT: s_cselect_b32 s3, s7, s3 -; GFX9-NEXT: s_xor_b32 s3, s3, s6 +; GFX9-NEXT: s_ashr_i32 s2, s2, 31 +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: s_mul_i32 s6, s6, s5 +; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6 +; GFX9-NEXT: s_add_i32 s5, s5, s6 +; GFX9-NEXT: s_mul_hi_u32 s5, s3, s5 +; GFX9-NEXT: s_mul_i32 s6, s5, s8 ; GFX9-NEXT: s_sub_i32 s3, s3, s6 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_add_i32 s7, s5, 1 +; GFX9-NEXT: s_sub_i32 s6, s3, s8 +; GFX9-NEXT: s_cmp_ge_u32 s3, s8 +; GFX9-NEXT: s_cselect_b32 s5, s7, s5 +; GFX9-NEXT: s_cselect_b32 s3, s6, s3 +; GFX9-NEXT: s_add_i32 s6, s5, 1 +; GFX9-NEXT: s_cmp_ge_u32 s3, s8 +; GFX9-NEXT: s_cselect_b32 s3, s6, s5 +; GFX9-NEXT: s_xor_b32 s3, s3, s2 +; GFX9-NEXT: s_sub_i32 s2, s3, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; 
GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -6620,9 +6627,9 @@ define amdgpu_kernel void @srem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: srem_i32_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6640,19 +6647,19 @@ define amdgpu_kernel void @srem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: srem_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 -; GFX9-NEXT: s_add_i32 s0, s0, s4 -; GFX9-NEXT: s_lshr_b32 s1, s0, 31 -; GFX9-NEXT: s_ashr_i32 s0, s0, 20 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_mul_i32 s0, s0, 0x12d8fb -; GFX9-NEXT: s_sub_i32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_mul_hi_i32 s2, s4, 0xd9528441 +; GFX9-NEXT: s_add_i32 s2, s2, s4 +; GFX9-NEXT: s_lshr_b32 s3, s2, 31 +; GFX9-NEXT: s_ashr_i32 s2, s2, 20 +; GFX9-NEXT: s_add_i32 s2, s2, s3 +; GFX9-NEXT: s_mul_i32 s2, s2, 0x12d8fb +; GFX9-NEXT: s_sub_i32 s2, s4, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %r = srem i32 %x, 1235195 store i32 %r, ptr addrspace(1) %out @@ -6667,8 +6674,8 @@ define amdgpu_kernel void @srem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: srem_i32_pow2k_denom: ; GFX6: ; %bb.0: 
-; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6683,17 +6690,17 @@ define amdgpu_kernel void @srem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: srem_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s4, 31 -; GFX9-NEXT: s_lshr_b32 s0, s0, 20 -; GFX9-NEXT: s_add_i32 s0, s4, s0 -; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 -; GFX9-NEXT: s_sub_i32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_ashr_i32 s2, s4, 31 +; GFX9-NEXT: s_lshr_b32 s2, s2, 20 +; GFX9-NEXT: s_add_i32 s2, s4, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 0xfffff000 +; GFX9-NEXT: s_sub_i32 s2, s4, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %r = srem i32 %x, 4096 store i32 %r, ptr addrspace(1) %out @@ -6709,7 +6716,7 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX6-LABEL: srem_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 ; GFX6-NEXT: s_ashr_i32 s4, s3, 31 @@ -6746,7 +6753,7 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: srem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 @@ -6798,7 +6805,7 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX6-LABEL: srem_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6821,7 +6828,7 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: srem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s2, 31 @@ -6927,44 +6934,44 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: srem_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 -; GFX6-NEXT: s_abs_i32 s2, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_sub_i32 s3, 0, s2 +; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6 +; GFX6-NEXT: s_abs_i32 s0, s0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX6-NEXT: s_sub_i32 s1, 0, s0 ; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 -; GFX6-NEXT: s_abs_i32 s3, s4 +; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX6-NEXT: s_abs_i32 s1, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 ; GFX6-NEXT: v_readfirstlane_b32 s7, v0 -; GFX6-NEXT: s_mul_i32 s7, s7, s2 -; GFX6-NEXT: s_sub_i32 s3, 
s3, s7 -; GFX6-NEXT: s_sub_i32 s7, s3, s2 -; GFX6-NEXT: s_cmp_ge_u32 s3, s2 -; GFX6-NEXT: s_cselect_b32 s3, s7, s3 -; GFX6-NEXT: s_sub_i32 s7, s3, s2 -; GFX6-NEXT: s_cmp_ge_u32 s3, s2 -; GFX6-NEXT: s_cselect_b32 s7, s7, s3 +; GFX6-NEXT: s_mul_i32 s7, s7, s0 +; GFX6-NEXT: s_sub_i32 s1, s1, s7 +; GFX6-NEXT: s_sub_i32 s7, s1, s0 +; GFX6-NEXT: s_cmp_ge_u32 s1, s0 +; GFX6-NEXT: s_cselect_b32 s1, s7, s1 +; GFX6-NEXT: s_sub_i32 s7, s1, s0 +; GFX6-NEXT: s_cmp_ge_u32 s1, s0 +; GFX6-NEXT: s_cselect_b32 s7, s7, s1 ; GFX6-NEXT: s_abs_i32 s6, s6 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX6-NEXT: s_sub_i32 s2, 0, s6 +; GFX6-NEXT: s_sub_i32 s0, 0, s6 ; GFX6-NEXT: s_abs_i32 s8, s5 ; GFX6-NEXT: s_xor_b32 s7, s7, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: s_sub_i32 s4, s7, s4 ; GFX6-NEXT: s_ashr_i32 s5, s5, 31 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -6982,20 +6989,20 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_sub_i32 s5, s6, s5 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s6 -; GFX9-NEXT: s_abs_i32 s2, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s7 -; GFX9-NEXT: s_sub_i32 s7, 0, s2 +; 
GFX9-NEXT: s_lshl_b32 s0, 0x1000, s6 +; GFX9-NEXT: s_abs_i32 s0, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX9-NEXT: s_lshl_b32 s1, 0x1000, s7 +; GFX9-NEXT: s_sub_i32 s7, 0, s0 ; GFX9-NEXT: s_ashr_i32 s6, s4, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_abs_i32 s4, s4 @@ -7006,41 +7013,43 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_mul_hi_u32 s7, s8, s7 ; GFX9-NEXT: s_add_i32 s8, s8, s7 ; GFX9-NEXT: s_mul_hi_u32 s7, s4, s8 -; GFX9-NEXT: s_mul_i32 s7, s7, s2 +; GFX9-NEXT: s_mul_i32 s7, s7, s0 ; GFX9-NEXT: s_sub_i32 s4, s4, s7 -; GFX9-NEXT: s_sub_i32 s7, s4, s2 -; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_sub_i32 s7, s4, s0 +; GFX9-NEXT: s_cmp_ge_u32 s4, s0 ; GFX9-NEXT: s_cselect_b32 s4, s7, s4 -; GFX9-NEXT: s_sub_i32 s7, s4, s2 -; GFX9-NEXT: s_cmp_ge_u32 s4, s2 -; GFX9-NEXT: s_cselect_b32 s2, s7, s4 -; GFX9-NEXT: s_abs_i32 s3, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_xor_b32 s2, s2, s6 -; GFX9-NEXT: s_sub_i32 s7, 0, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s6 +; GFX9-NEXT: s_sub_i32 s7, s4, s0 +; GFX9-NEXT: s_cmp_ge_u32 s4, s0 +; GFX9-NEXT: s_cselect_b32 s4, s7, s4 +; GFX9-NEXT: s_abs_i32 s7, s1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_xor_b32 s4, s4, s6 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_ashr_i32 s2, s5, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s4, s5, 31 -; GFX9-NEXT: s_abs_i32 s5, s5 +; GFX9-NEXT: s_abs_i32 s3, s5 +; GFX9-NEXT: s_sub_i32 s5, 0, s7 +; GFX9-NEXT: s_sub_i32 s4, s4, s6 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s7, s7, s6 -; GFX9-NEXT: s_mul_hi_u32 s7, s6, s7 -; GFX9-NEXT: s_add_i32 s6, s6, s7 -; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6 -; GFX9-NEXT: s_mul_i32 s6, s6, s3 -; GFX9-NEXT: s_sub_i32 s5, s5, s6 -; GFX9-NEXT: s_sub_i32 s6, s5, s3 -; GFX9-NEXT: s_cmp_ge_u32 s5, s3 -; GFX9-NEXT: 
s_cselect_b32 s5, s6, s5 -; GFX9-NEXT: s_sub_i32 s6, s5, s3 -; GFX9-NEXT: s_cmp_ge_u32 s5, s3 -; GFX9-NEXT: s_cselect_b32 s3, s6, s5 -; GFX9-NEXT: s_xor_b32 s3, s3, s4 -; GFX9-NEXT: s_sub_i32 s3, s3, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_mul_i32 s5, s5, s6 +; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 +; GFX9-NEXT: s_add_i32 s6, s6, s5 +; GFX9-NEXT: s_mul_hi_u32 s5, s3, s6 +; GFX9-NEXT: s_mul_i32 s5, s5, s7 +; GFX9-NEXT: s_sub_i32 s3, s3, s5 +; GFX9-NEXT: s_sub_i32 s5, s3, s7 +; GFX9-NEXT: s_cmp_ge_u32 s3, s7 +; GFX9-NEXT: s_cselect_b32 s3, s5, s3 +; GFX9-NEXT: s_sub_i32 s5, s3, s7 +; GFX9-NEXT: s_cmp_ge_u32 s3, s7 +; GFX9-NEXT: s_cselect_b32 s3, s5, s3 +; GFX9-NEXT: s_xor_b32 s3, s3, s2 +; GFX9-NEXT: s_sub_i32 s2, s3, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -7069,7 +7078,7 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX6-NEXT: v_mul_hi_u32 v2, v0, s4 ; GFX6-NEXT: s_addc_u32 s5, s5, 0 ; GFX6-NEXT: s_mul_i32 s6, s5, 0x68958c89 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s6, v1 @@ -7154,11 +7163,11 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: udiv_i64_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_add_u32 s0, 3, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xe3e0f6 ; GFX9-NEXT: s_addc_u32 s1, 0, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: s_addc_u32 s0, s1, 0 @@ -7267,7 +7276,7 @@ define amdgpu_kernel void 
@udiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: udiv_i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7281,7 +7290,7 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: udiv_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 @@ -7303,8 +7312,8 @@ define amdgpu_kernel void @udiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: udiv_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s8, s[2:3], 0xd ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7319,12 +7328,12 @@ define amdgpu_kernel void @udiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: udiv_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s2, s2, 12 -; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s2 +; GFX9-NEXT: s_add_i32 s0, s0, 12 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -7348,8 +7357,8 @@ define amdgpu_kernel void @udiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX6-LABEL: udiv_v2i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7364,17 +7373,17 @@ define amdgpu_kernel void @udiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX9-LABEL: udiv_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 12 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 12 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -7394,8 +7403,8 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, < ; ; GFX6-LABEL: udiv_v2i64_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s2, 0x2ff2fc01 ; GFX6-NEXT: v_bfrev_b32_e32 v0, 7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7481,13 +7490,13 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, < ; ; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s8, 0x2ff2fc01 ; GFX9-NEXT: v_bfrev_b32_e32 v0, 7 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 12 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 12 ; GFX9-NEXT: s_add_u32 s4, 0xe037f, s8 ; GFX9-NEXT: s_addc_u32 s5, 0, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 @@ -7570,9 +7579,9 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, < ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -7595,27 +7604,27 @@ define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: udiv_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 +; GFX6-NEXT: s_mov_b32 s15, 0xf000 +; GFX6-NEXT: s_mov_b32 s14, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s8, s8, 12 -; GFX6-NEXT: s_add_i32 s9, s10, 12 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 -; GFX6-NEXT: s_lshr_b64 s[6:7], s[6:7], s9 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, s7 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NEXT: s_add_i32 s0, s8, 12 +; GFX6-NEXT: s_add_i32 s2, s10, 12 +; GFX6-NEXT: s_lshr_b64 s[0:1], 
s[4:5], s0 +; GFX6-NEXT: s_lshr_b64 s[2:3], s[6:7], s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_i32 s2, s8, 12 ; GFX9-NEXT: s_add_i32 s8, s10, 12 @@ -7641,12 +7650,12 @@ define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: urem_i64_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NEXT: s_add_u32 s0, 4, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xe3e0fc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: s_addc_u32 s1, 0, 0 ; GFX6-NEXT: s_or_b32 s0, vcc_lo, vcc_hi +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_mov_b32 s0, 0x689e0837 ; GFX6-NEXT: s_movk_i32 s2, 0xfee0 @@ -7737,11 +7746,11 @@ define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: urem_i64_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_add_u32 s0, 4, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xe3e0fc ; GFX9-NEXT: s_addc_u32 s1, 0, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: s_addc_u32 s0, s1, 0 @@ -7848,7 +7857,7 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: urem_i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 
; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 @@ -7862,7 +7871,7 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: urem_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xfff @@ -7883,8 +7892,8 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: urem_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s8, s[2:3], 0xd ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7901,11 +7910,11 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: urem_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s2 +; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 ; GFX9-NEXT: s_add_u32 s0, s0, -1 ; GFX9-NEXT: s_addc_u32 s1, s1, -1 ; GFX9-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] @@ -7932,8 +7941,8 @@ define amdgpu_kernel void @urem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX6-LABEL: urem_v2i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -7948,16 +7957,16 @@ 
define amdgpu_kernel void @urem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX9-LABEL: urem_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s4, 0xfff -; GFX9-NEXT: s_and_b32 s1, s6, 0xfff -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[2:3] +; GFX9-NEXT: s_and_b32 s2, s4, 0xfff +; GFX9-NEXT: s_and_b32 s3, s6, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %r = urem <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -7980,31 +7989,31 @@ define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: urem_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 +; GFX6-NEXT: s_mov_b32 s15, 0xf000 +; GFX6-NEXT: s_mov_b32 s14, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[10:11], 0x1000, s10 -; GFX6-NEXT: s_lshl_b64 s[8:9], 0x1000, s8 -; GFX6-NEXT: s_add_u32 s8, s8, -1 -; GFX6-NEXT: s_addc_u32 s9, s9, -1 -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9] -; GFX6-NEXT: s_add_u32 s8, s10, -1 -; GFX6-NEXT: s_addc_u32 s9, s11, -1 -; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, s7 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; 
GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s10 +; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 +; GFX6-NEXT: s_add_u32 s2, s2, -1 +; GFX6-NEXT: s_addc_u32 s3, s3, -1 +; GFX6-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] +; GFX6-NEXT: s_add_u32 s0, s0, -1 +; GFX6-NEXT: s_addc_u32 s1, s1, -1 +; GFX6-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s10 ; GFX9-NEXT: s_lshl_b64 s[8:9], 0x1000, s8 @@ -8034,7 +8043,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: sdiv_i64_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s0, 0x33fe64 ; GFX6-NEXT: s_add_u32 s1, 0x396, s0 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x28100000 @@ -8150,7 +8159,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX9-NEXT: s_mul_i32 s10, s4, s8 ; GFX9-NEXT: s_addc_u32 s8, 0, s11 ; GFX9-NEXT: s_add_u32 s6, s6, s10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mul_hi_u32 s7, s4, s5 ; GFX9-NEXT: s_addc_u32 s6, s8, s9 ; GFX9-NEXT: s_addc_u32 s7, s7, 0 @@ -8234,7 +8243,7 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: sdiv_i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; 
GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8252,7 +8261,7 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: sdiv_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 @@ -8278,21 +8287,21 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: sdiv_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s2, s[0:1], 0xd +; GFX6-NEXT: s_load_dword s0, s[2:3], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s2 -; GFX6-NEXT: s_ashr_i32 s8, s3, 31 -; GFX6-NEXT: s_add_u32 s2, s2, s8 +; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 +; GFX6-NEXT: s_ashr_i32 s8, s1, 31 +; GFX6-NEXT: s_add_u32 s0, s0, s8 ; GFX6-NEXT: s_mov_b32 s9, s8 -; GFX6-NEXT: s_addc_u32 s3, s3, s8 -; GFX6-NEXT: s_xor_b64 s[10:11], s[2:3], s[8:9] +; GFX6-NEXT: s_addc_u32 s1, s1, s8 +; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GFX6-NEXT: s_sub_u32 s4, 0, s10 ; GFX6-NEXT: s_subb_u32 s5, 0, s11 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8414,19 +8423,19 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: sdiv_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[4:5], 0x1000, s2 -; GFX9-NEXT: s_ashr_i32 
s2, s5, 31 -; GFX9-NEXT: s_add_u32 s4, s4, s2 -; GFX9-NEXT: s_mov_b32 s3, s2 -; GFX9-NEXT: s_addc_u32 s5, s5, s2 -; GFX9-NEXT: s_xor_b64 s[8:9], s[4:5], s[2:3] -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_sub_u32 s0, 0, s8 -; GFX9-NEXT: s_subb_u32 s1, 0, s9 +; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 +; GFX9-NEXT: s_ashr_i32 s8, s1, 31 +; GFX9-NEXT: s_add_u32 s0, s0, s8 +; GFX9-NEXT: s_mov_b32 s9, s8 +; GFX9-NEXT: s_addc_u32 s1, s1, s8 +; GFX9-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11 +; GFX9-NEXT: s_sub_u32 s0, 0, s10 +; GFX9-NEXT: s_subb_u32 s1, 0, s11 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -8436,61 +8445,60 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s10, v2 -; GFX9-NEXT: v_readfirstlane_b32 s11, v1 -; GFX9-NEXT: s_mul_i32 s12, s0, s10 -; GFX9-NEXT: s_mul_hi_u32 s14, s0, s11 -; GFX9-NEXT: s_mul_i32 s13, s1, s11 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v1 +; GFX9-NEXT: s_mul_i32 s12, s0, s2 +; GFX9-NEXT: s_mul_hi_u32 s14, s0, s3 +; GFX9-NEXT: s_mul_i32 s13, s1, s3 ; GFX9-NEXT: s_add_i32 s12, s14, s12 -; GFX9-NEXT: s_mul_i32 s15, s0, s11 +; GFX9-NEXT: s_mul_i32 s15, s0, s3 ; GFX9-NEXT: s_add_i32 s12, s12, s13 -; GFX9-NEXT: s_mul_hi_u32 s14, s11, s15 -; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12 -; GFX9-NEXT: s_mul_i32 s11, s11, s12 -; GFX9-NEXT: s_add_u32 s11, s14, s11 +; GFX9-NEXT: s_mul_hi_u32 s14, s3, s15 +; GFX9-NEXT: s_mul_hi_u32 s13, s3, s12 +; GFX9-NEXT: s_mul_i32 s3, s3, s12 +; GFX9-NEXT: s_add_u32 s3, s14, s3 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_mul_hi_u32 s16, 
s10, s15 -; GFX9-NEXT: s_mul_i32 s15, s10, s15 -; GFX9-NEXT: s_add_u32 s11, s11, s15 -; GFX9-NEXT: s_mul_hi_u32 s14, s10, s12 -; GFX9-NEXT: s_addc_u32 s11, s13, s16 +; GFX9-NEXT: s_mul_hi_u32 s16, s2, s15 +; GFX9-NEXT: s_mul_i32 s15, s2, s15 +; GFX9-NEXT: s_add_u32 s3, s3, s15 +; GFX9-NEXT: s_mul_hi_u32 s14, s2, s12 +; GFX9-NEXT: s_addc_u32 s3, s13, s16 ; GFX9-NEXT: s_addc_u32 s13, s14, 0 -; GFX9-NEXT: s_mul_i32 s12, s10, s12 -; GFX9-NEXT: s_add_u32 s11, s11, s12 +; GFX9-NEXT: s_mul_i32 s12, s2, s12 +; GFX9-NEXT: s_add_u32 s3, s3, s12 ; GFX9-NEXT: s_addc_u32 s12, 0, s13 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s11, v1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s3, v1 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s10, s10, s12 +; GFX9-NEXT: s_addc_u32 s2, s2, s12 ; GFX9-NEXT: v_readfirstlane_b32 s12, v1 -; GFX9-NEXT: s_mul_i32 s11, s0, s10 +; GFX9-NEXT: s_mul_i32 s3, s0, s2 ; GFX9-NEXT: s_mul_hi_u32 s13, s0, s12 -; GFX9-NEXT: s_add_i32 s11, s13, s11 +; GFX9-NEXT: s_add_i32 s3, s13, s3 ; GFX9-NEXT: s_mul_i32 s1, s1, s12 -; GFX9-NEXT: s_add_i32 s11, s11, s1 +; GFX9-NEXT: s_add_i32 s3, s3, s1 ; GFX9-NEXT: s_mul_i32 s0, s0, s12 -; GFX9-NEXT: s_mul_hi_u32 s13, s10, s0 -; GFX9-NEXT: s_mul_i32 s14, s10, s0 -; GFX9-NEXT: s_mul_i32 s16, s12, s11 +; GFX9-NEXT: s_mul_hi_u32 s13, s2, s0 +; GFX9-NEXT: s_mul_i32 s14, s2, s0 +; GFX9-NEXT: s_mul_i32 s16, s12, s3 ; GFX9-NEXT: s_mul_hi_u32 s0, s12, s0 -; GFX9-NEXT: s_mul_hi_u32 s15, s12, s11 +; GFX9-NEXT: s_mul_hi_u32 s15, s12, s3 ; GFX9-NEXT: s_add_u32 s0, s0, s16 ; GFX9-NEXT: s_addc_u32 s12, 0, s15 ; GFX9-NEXT: s_add_u32 s0, s0, s14 -; GFX9-NEXT: s_mul_hi_u32 s1, s10, s11 +; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 ; GFX9-NEXT: s_addc_u32 s0, s12, s13 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_mul_i32 s11, s10, s11 -; GFX9-NEXT: s_add_u32 s0, s0, s11 +; GFX9-NEXT: s_mul_i32 s3, s2, s3 +; GFX9-NEXT: s_add_u32 s0, s0, s3 ; GFX9-NEXT: s_addc_u32 s1, 0, s1 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 ; GFX9-NEXT: s_cmp_lg_u64 
vcc, 0 -; GFX9-NEXT: s_addc_u32 s12, s10, s1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s10, s7, 31 -; GFX9-NEXT: s_add_u32 s0, s6, s10 -; GFX9-NEXT: s_mov_b32 s11, s10 -; GFX9-NEXT: s_addc_u32 s1, s7, s10 -; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] +; GFX9-NEXT: s_addc_u32 s12, s2, s1 +; GFX9-NEXT: s_ashr_i32 s2, s7, 31 +; GFX9-NEXT: s_add_u32 s0, s6, s2 +; GFX9-NEXT: s_mov_b32 s3, s2 +; GFX9-NEXT: s_addc_u32 s1, s7, s2 +; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s13, v1 ; GFX9-NEXT: s_mul_i32 s1, s6, s12 ; GFX9-NEXT: s_mul_hi_u32 s14, s6, s13 @@ -8506,24 +8514,24 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_mul_i32 s12, s7, s12 ; GFX9-NEXT: s_add_u32 s12, s0, s12 ; GFX9-NEXT: s_addc_u32 s13, 0, s1 -; GFX9-NEXT: s_mul_i32 s0, s8, s13 -; GFX9-NEXT: s_mul_hi_u32 s1, s8, s12 +; GFX9-NEXT: s_mul_i32 s0, s10, s13 +; GFX9-NEXT: s_mul_hi_u32 s1, s10, s12 ; GFX9-NEXT: s_add_i32 s0, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s9, s12 +; GFX9-NEXT: s_mul_i32 s1, s11, s12 ; GFX9-NEXT: s_add_i32 s14, s0, s1 -; GFX9-NEXT: s_mul_i32 s1, s8, s12 +; GFX9-NEXT: s_mul_i32 s1, s10, s12 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_sub_i32 s0, s7, s14 ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s6, v1 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s6, s0, s9 -; GFX9-NEXT: v_subrev_co_u32_e64 v2, s[0:1], s8, v1 +; GFX9-NEXT: s_subb_u32 s6, s0, s11 +; GFX9-NEXT: v_subrev_co_u32_e64 v2, s[0:1], s10, v1 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_subb_u32 s6, s6, 0 -; GFX9-NEXT: s_cmp_ge_u32 s6, s9 +; GFX9-NEXT: s_cmp_ge_u32 s6, s11 ; GFX9-NEXT: s_cselect_b32 s15, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v2 -; GFX9-NEXT: s_cmp_eq_u32 s6, s9 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 +; GFX9-NEXT: s_cmp_eq_u32 s6, s11 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -8541,10 
+8549,10 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] ; GFX9-NEXT: s_subb_u32 s0, s7, s14 -; GFX9-NEXT: s_cmp_ge_u32 s0, s9 +; GFX9-NEXT: s_cmp_ge_u32 s0, s11 ; GFX9-NEXT: s_cselect_b32 s1, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 -; GFX9-NEXT: s_cmp_eq_u32 s0, s9 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v1 +; GFX9-NEXT: s_cmp_eq_u32 s0, s11 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 @@ -8554,7 +8562,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[2:3] +; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[8:9] ; GFX9-NEXT: v_xor_b32_e32 v2, s0, v2 ; GFX9-NEXT: v_xor_b32_e32 v3, s1, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, s1 @@ -8581,8 +8589,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX6-LABEL: sdiv_v2i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8605,25 +8613,25 @@ define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX9-LABEL: sdiv_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s5, 31 -; GFX9-NEXT: s_lshr_b32 s0, s0, 20 -; GFX9-NEXT: 
s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, 0 +; GFX9-NEXT: s_ashr_i32 s2, s5, 31 +; GFX9-NEXT: s_lshr_b32 s2, s2, 20 +; GFX9-NEXT: s_add_u32 s2, s4, s2 +; GFX9-NEXT: s_addc_u32 s3, s5, 0 ; GFX9-NEXT: s_ashr_i32 s4, s7, 31 -; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 +; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 ; GFX9-NEXT: s_lshr_b32 s4, s4, 20 ; GFX9-NEXT: s_add_u32 s4, s6, s4 ; GFX9-NEXT: s_addc_u32 s5, s7, 0 ; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -8643,8 +8651,8 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, ; ; GFX6-LABEL: ssdiv_v2i64_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s2, 0x2ff2fc01 ; GFX6-NEXT: v_bfrev_b32_e32 v0, 7 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -8744,17 +8752,17 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, ; ; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s8, 0x2ff2fc01 ; GFX9-NEXT: v_bfrev_b32_e32 v0, 7 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s5, 31 -; GFX9-NEXT: s_lshr_b32 s0, s0, 20 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, 0 -; 
GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 +; GFX9-NEXT: s_ashr_i32 s2, s5, 31 +; GFX9-NEXT: s_lshr_b32 s2, s2, 20 +; GFX9-NEXT: s_add_u32 s2, s4, s2 +; GFX9-NEXT: s_addc_u32 s3, s5, 0 +; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 ; GFX9-NEXT: s_add_u32 s4, 0xe037f, s8 ; GFX9-NEXT: s_addc_u32 s5, 0, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 @@ -8838,11 +8846,11 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GFX9-NEXT: s_sub_u32 s5, s6, s4 ; GFX9-NEXT: s_subb_u32 s4, s7, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -8865,37 +8873,36 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 -; GFX6-NEXT: s_lshl_b64 s[12:13], 0x1000, s10 -; GFX6-NEXT: s_ashr_i32 s14, s3, 31 -; GFX6-NEXT: s_add_u32 s2, s2, s14 -; GFX6-NEXT: s_mov_b32 s15, s14 -; GFX6-NEXT: s_addc_u32 s3, s3, s14 -; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[14:15] -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX6-NEXT: s_sub_u32 s10, 0, s2 -; GFX6-NEXT: s_subb_u32 s11, 0, s3 -; GFX6-NEXT: s_ashr_i32 s16, s5, 31 +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s8 +; GFX6-NEXT: s_lshl_b64 s[14:15], 0x1000, s10 +; GFX6-NEXT: s_ashr_i32 s16, s1, 31 +; GFX6-NEXT: s_add_u32 s0, s0, s16 +; GFX6-NEXT: s_mov_b32 s17, s16 +; GFX6-NEXT: s_addc_u32 s1, 
s1, s16 +; GFX6-NEXT: s_xor_b64 s[12:13], s[0:1], s[16:17] +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s13 +; GFX6-NEXT: s_sub_u32 s0, 0, s12 +; GFX6-NEXT: s_subb_u32 s1, 0, s13 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 -; GFX6-NEXT: s_add_u32 s0, s4, s16 -; GFX6-NEXT: s_mov_b32 s17, s16 +; GFX6-NEXT: s_ashr_i32 s2, s5, 31 +; GFX6-NEXT: s_mov_b32 s3, s2 +; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_addc_u32 s1, s5, s16 -; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[16:17] -; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s11, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s10, v0 +; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v5, s1, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s0, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 @@ -8914,12 +8921,11 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0 -; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s0, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; 
GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 @@ -8935,8 +8941,11 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: s_add_u32 s0, s4, s2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: s_addc_u32 s1, s5, s2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[2:3] ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 ; GFX6-NEXT: v_mul_hi_u32 v4, s4, v1 @@ -8946,29 +8955,28 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, s3 +; GFX6-NEXT: v_mul_lo_u32 v2, s12, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s13, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, s13 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s12, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v2 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 +; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s12, v3 ; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v4 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 ; GFX6-NEXT: 
v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] ; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 1, v0 ; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] @@ -8977,23 +8985,23 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1] -; GFX6-NEXT: s_xor_b64 s[0:1], s[16:17], s[14:15] -; GFX6-NEXT: s_ashr_i32 s4, s13, 31 -; GFX6-NEXT: s_add_u32 s12, s12, s4 +; GFX6-NEXT: s_xor_b64 s[0:1], s[2:3], s[16:17] +; GFX6-NEXT: s_ashr_i32 s2, s15, 31 +; GFX6-NEXT: s_add_u32 s4, s14, s2 ; GFX6-NEXT: v_mov_b32_e32 v6, s5 -; GFX6-NEXT: s_mov_b32 s5, s4 -; GFX6-NEXT: s_addc_u32 s13, s13, s4 -; GFX6-NEXT: s_xor_b64 s[12:13], s[12:13], s[4:5] +; GFX6-NEXT: s_mov_b32 s3, s2 +; GFX6-NEXT: s_addc_u32 s5, s15, s2 +; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] ; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s12 -; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s13 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s5 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 ; GFX6-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 ; GFX6-NEXT: v_rcp_f32_e32 v6, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v6 @@ -9002,16 +9010,16 @@ define 
amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX6-NEXT: s_sub_u32 s2, 0, s12 +; GFX6-NEXT: s_sub_u32 s12, 0, s4 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-NEXT: v_mul_hi_u32 v4, s2, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s2, v3 -; GFX6-NEXT: s_subb_u32 s3, 0, s13 -; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 +; GFX6-NEXT: v_mul_hi_u32 v4, s12, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s12, v3 +; GFX6-NEXT: s_subb_u32 s13, 0, s5 +; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 ; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v6, v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v7, v2, v5 @@ -9030,11 +9038,11 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s2, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s2, v2 -; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 +; GFX6-NEXT: v_mul_lo_u32 v4, s12, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s12, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 @@ -9049,14 +9057,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: s_ashr_i32 s2, s7, 31 +; GFX6-NEXT: s_ashr_i32 s12, s7, 31 ; GFX6-NEXT: v_addc_u32_e32 
v5, vcc, 0, v6, vcc -; GFX6-NEXT: s_add_u32 s6, s6, s2 +; GFX6-NEXT: s_add_u32 s6, s6, s12 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: s_mov_b32 s3, s2 -; GFX6-NEXT: s_addc_u32 s7, s7, s2 +; GFX6-NEXT: s_mov_b32 s13, s12 +; GFX6-NEXT: s_addc_u32 s7, s7, s12 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] +; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[12:13] ; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 ; GFX6-NEXT: v_mul_hi_u32 v5, s6, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, s6, v3 @@ -9072,25 +9080,25 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s12, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s12, v2 +; GFX6-NEXT: v_mul_lo_u32 v4, s4, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s4, v2 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, s5, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s4, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s7, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, s13 +; GFX6-NEXT: v_mov_b32_e32 v7, s5 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s6, v5 ; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc -; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s12, v5 +; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s4, v5 ; GFX6-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v6 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v7 ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v6 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 
s5, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] ; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v2 ; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] @@ -9101,15 +9109,15 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cndmask_b32_e64 v7, v8, v10, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v8, s7 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v4 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v5 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s5, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX6-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] +; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[2:3] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX6-NEXT: v_xor_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_xor_b32_e32 v3, s1, v3 @@ -9122,19 +9130,19 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 +; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s8 ; GFX9-NEXT: s_lshl_b64 s[10:11], 0x1000, s10 -; GFX9-NEXT: s_ashr_i32 s8, s3, 31 -; GFX9-NEXT: s_add_u32 s2, s2, s8 +; GFX9-NEXT: s_ashr_i32 s8, s1, 31 +; GFX9-NEXT: s_add_u32 s0, s0, s8 ; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_addc_u32 s3, s3, s8 -; GFX9-NEXT: s_xor_b64 s[12:13], s[2:3], s[8:9] +; GFX9-NEXT: s_addc_u32 s1, s1, s8 +; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[8:9] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, 
s12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_sub_u32 s0, 0, s12 ; GFX9-NEXT: s_subb_u32 s1, 0, s13 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 @@ -9408,7 +9416,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mov_b32_e32 v6, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v3 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i64> , %y @@ -9425,7 +9432,6 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: srem_i64_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s0, 0x33fe64 ; GFX6-NEXT: s_add_u32 s0, 0x396, s0 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x28100000 @@ -9445,6 +9451,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v4, s1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, s1, v2 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GFX6-NEXT: v_mul_hi_u32 v3, s1, v1 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc @@ -9539,7 +9546,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX9-NEXT: s_mul_i32 s10, s4, s8 ; GFX9-NEXT: s_addc_u32 s8, 0, s11 ; GFX9-NEXT: s_add_u32 s6, s6, s10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mul_hi_u32 s7, s4, s5 ; GFX9-NEXT: s_addc_u32 s6, s8, s9 ; GFX9-NEXT: s_addc_u32 s7, s7, 0 @@ -9626,7 +9633,7 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: srem_i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; 
GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9646,7 +9653,7 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: srem_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 @@ -9674,21 +9681,21 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: srem_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s2, s[0:1], 0xd +; GFX6-NEXT: s_load_dword s0, s[2:3], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s2 -; GFX6-NEXT: s_ashr_i32 s4, s3, 31 -; GFX6-NEXT: s_add_u32 s2, s2, s4 +; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 +; GFX6-NEXT: s_ashr_i32 s4, s1, 31 +; GFX6-NEXT: s_add_u32 s0, s0, s4 ; GFX6-NEXT: s_mov_b32 s5, s4 -; GFX6-NEXT: s_addc_u32 s3, s3, s4 -; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] +; GFX6-NEXT: s_addc_u32 s1, s1, s4 +; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[4:5] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX6-NEXT: s_sub_u32 s4, 0, s8 ; GFX6-NEXT: s_subb_u32 s5, 0, s9 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9808,17 +9815,17 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: srem_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s2 -; GFX9-NEXT: s_ashr_i32 s4, s3, 31 -; GFX9-NEXT: s_add_u32 s2, s2, s4 +; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 +; 
GFX9-NEXT: s_ashr_i32 s4, s1, 31 +; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_addc_u32 s3, s3, s4 -; GFX9-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] +; GFX9-NEXT: s_addc_u32 s1, s1, s4 +; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[4:5] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_sub_u32 s0, 0, s8 ; GFX9-NEXT: s_subb_u32 s1, 0, s9 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 @@ -9972,8 +9979,8 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX6-LABEL: srem_v2i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10000,17 +10007,17 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX9-LABEL: srem_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s5, 31 -; GFX9-NEXT: s_lshr_b32 s0, s0, 20 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, 0 -; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 -; GFX9-NEXT: s_sub_u32 s0, s4, s0 -; GFX9-NEXT: s_subb_u32 s1, s5, s1 +; GFX9-NEXT: s_ashr_i32 s2, s5, 31 +; GFX9-NEXT: s_lshr_b32 s2, s2, 20 +; GFX9-NEXT: s_add_u32 s2, s4, s2 +; GFX9-NEXT: s_addc_u32 s3, s5, 0 +; GFX9-NEXT: s_and_b32 s2, s2, 0xfffff000 +; GFX9-NEXT: s_sub_u32 s2, s4, s2 +; GFX9-NEXT: s_subb_u32 s3, s5, s3 ; GFX9-NEXT: s_ashr_i32 s4, s7, 31 ; GFX9-NEXT: 
s_lshr_b32 s4, s4, 20 ; GFX9-NEXT: s_add_u32 s4, s6, s4 @@ -10018,11 +10025,11 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 ; GFX9-NEXT: s_sub_u32 s4, s6, s4 ; GFX9-NEXT: s_subb_u32 s5, s7, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %r = srem <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -10045,39 +10052,36 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: srem_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 +; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s8 ; GFX6-NEXT: s_lshl_b64 s[16:17], 0x1000, s10 -; GFX6-NEXT: s_ashr_i32 s8, s3, 31 -; GFX6-NEXT: s_add_u32 s2, s2, s8 +; GFX6-NEXT: s_ashr_i32 s8, s1, 31 +; GFX6-NEXT: s_add_u32 s0, s0, s8 ; GFX6-NEXT: s_mov_b32 s9, s8 -; GFX6-NEXT: s_addc_u32 s3, s3, s8 -; GFX6-NEXT: s_xor_b64 s[14:15], s[2:3], s[8:9] +; GFX6-NEXT: s_addc_u32 s1, s1, s8 +; GFX6-NEXT: s_xor_b64 s[14:15], s[0:1], s[8:9] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s14 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s15 -; GFX6-NEXT: s_sub_u32 s2, 0, s14 -; GFX6-NEXT: s_subb_u32 s3, 0, s15 +; GFX6-NEXT: s_sub_u32 s0, 0, s14 +; GFX6-NEXT: s_subb_u32 s1, 0, s15 ; GFX6-NEXT: s_ashr_i32 s12, s5, 31 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 -; GFX6-NEXT: s_add_u32 s0, s4, s12 ; GFX6-NEXT: s_mov_b32 s13, s12 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; 
GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_addc_u32 s1, s5, s12 -; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[12:13] -; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s3, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s2, v0 -; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v5, s1, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s0, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 @@ -10096,11 +10100,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s0, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 @@ -10116,8 +10120,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: s_add_u32 s0, s4, s12 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: s_addc_u32 s1, s5, s12 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[12:13] ; 
GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 ; GFX6-NEXT: v_mul_hi_u32 v4, s4, v1 @@ -10298,19 +10305,19 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: srem_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 +; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s8 ; GFX9-NEXT: s_lshl_b64 s[10:11], 0x1000, s10 -; GFX9-NEXT: s_ashr_i32 s8, s3, 31 -; GFX9-NEXT: s_add_u32 s2, s2, s8 +; GFX9-NEXT: s_ashr_i32 s8, s1, 31 +; GFX9-NEXT: s_add_u32 s0, s0, s8 ; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_addc_u32 s3, s3, s8 -; GFX9-NEXT: s_xor_b64 s[12:13], s[2:3], s[8:9] +; GFX9-NEXT: s_addc_u32 s1, s1, s8 +; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[8:9] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX9-NEXT: s_sub_u32 s0, 0, s12 ; GFX9-NEXT: s_subb_u32 s1, 0, s13 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll index 9f5b6389ab59f..52e76dd24a20b 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll @@ -6,7 +6,7 @@ define weak_odr amdgpu_kernel void @test_mul24_knownbits_kernel(ptr addrspace(1) ; GCN-LABEL: test_mul24_knownbits_kernel: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: v_and_b32_e32 v0, 3, v0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GCN-NEXT: v_mul_i32_i24_e32 v0, -5, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffffffe0, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll 
b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll index a35fbaadddf9e..1358d91ae102c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll @@ -240,7 +240,7 @@ entry: define void @sincos_v2f32_nocontract(<2 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v2f32_nocontract -; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <2 x float>, align 8, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call <2 x float> @_Z6sincosDv2_fPU3AS5S_(<2 x float> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -298,7 +298,7 @@ entry: define void @sincos_v2f32(<2 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v2f32 -; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <2 x float>, align 8, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <2 x float> @_Z6sincosDv2_fPU3AS5S_(<2 x float> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -317,7 +317,7 @@ entry: define void @sincos_v3f32(<3 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define 
void @sincos_v3f32 -; CHECK-SAME: (<3 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: (<3 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <3 x float>, align 16, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <3 x float> @_Z6sincosDv3_fPU3AS5S_(<3 x float> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -340,7 +340,7 @@ entry: define void @sincos_v4f32(<4 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v4f32 -; CHECK-SAME: (<4 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: (<4 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <4 x float>, align 16, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <4 x float> @_Z6sincosDv4_fPU3AS5S_(<4 x float> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -359,7 +359,7 @@ entry: define void @sincos_v8f32(<8 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v8f32 -; CHECK-SAME: (<8 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: (<8 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <8 x 
float>, align 32, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <8 x float> @_Z6sincosDv8_fPU3AS5S_(<8 x float> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -378,7 +378,7 @@ entry: define void @sincos_v16f32(<16 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v16f32 -; CHECK-SAME: (<16 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: (<16 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <16 x float>, align 64, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <16 x float> @_Z6sincosDv16_fPU3AS5S_(<16 x float> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -397,7 +397,7 @@ entry: define void @sincos_f64_nocontract(double %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_f64_nocontract -; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca double, align 8, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call double @_Z6sincosdPU3AS5d(double [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -417,7 +417,7 @@ entry: define void @sincos_v2f64_nocontract(<2 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v2f64_nocontract -; CHECK-SAME: (<2 x double> [[X:%.*]], ptr addrspace(1) 
nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: (<2 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <2 x double>, align 16, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call <2 x double> @_Z6sincosDv2_dPU3AS5S_(<2 x double> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -436,7 +436,7 @@ entry: define void @sincos_f64(double %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_f64 -; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca double, align 8, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract double @_Z6sincosdPU3AS5d(double [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -455,7 +455,7 @@ entry: define void @sincos_f64_order1(double %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_f64_order1 -; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca double, align 8, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract double @_Z6sincosdPU3AS5d(double [[X]], ptr addrspace(5) 
[[__SINCOS_]]) @@ -474,7 +474,7 @@ entry: define void @sincos_v2f64(<2 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v2f64 -; CHECK-SAME: (<2 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: (<2 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <2 x double>, align 16, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <2 x double> @_Z6sincosDv2_dPU3AS5S_(<2 x double> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -493,7 +493,7 @@ entry: define void @sincos_v3f64(<3 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v3f64 -; CHECK-SAME: (<3 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: (<3 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <3 x double>, align 32, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <3 x double> @_Z6sincosDv3_dPU3AS5S_(<3 x double> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -516,7 +516,7 @@ entry: define void @sincos_v4f64(<4 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v4f64 -; CHECK-SAME: (<4 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: (<4 x double> 
[[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <4 x double>, align 32, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <4 x double> @_Z6sincosDv4_dPU3AS5S_(<4 x double> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -535,7 +535,7 @@ entry: define void @sincos_v8f64(<8 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v8f64 -; CHECK-SAME: (<8 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: (<8 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <8 x double>, align 64, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <8 x double> @_Z6sincosDv8_dPU3AS5S_(<8 x double> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -554,7 +554,7 @@ entry: define void @sincos_v16f64(<16 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v16f64 -; CHECK-SAME: (<16 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: (<16 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <16 x double>, align 128, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <16 x double> @_Z6sincosDv16_dPU3AS5S_(<16 x double> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -666,7 +666,7 @@ bb1: define 
float @select_sin_or_cos_f32(i1 %cond, float %x) { ; CHECK-LABEL: define float @select_sin_or_cos_f32 -; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] { +; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -685,7 +685,7 @@ declare void @func(ptr addrspace(1)) define void @sincos_f32_value_is_instr(ptr addrspace(1) %value.ptr, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_f32_value_is_instr -; CHECK-SAME: (ptr addrspace(1) [[VALUE_PTR:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: (ptr addrspace(1) [[VALUE_PTR:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) ; CHECK-NEXT: tail call void @func(ptr addrspace(1) [[VALUE_PTR]]) @@ -838,7 +838,7 @@ entry: define void @sincos_v2f32_flag_intersect1(<2 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v2f32_flag_intersect1 -; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { +; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <2 x float>, align 8, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] 
= call nnan contract <2 x float> @_Z6sincosDv2_fPU3AS5S_(<2 x float> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -859,7 +859,7 @@ declare void @use_stack_ptrs(ptr addrspace(5), ptr addrspace(5)) define void @sincos_f32_alloca_insertpt(float %x) { ; CHECK-LABEL: define void @sincos_f32_alloca_insertpt -; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr { +; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ALLOCA0:%.*]] = alloca i32, align 4, addrspace(5) ; CHECK-NEXT: [[ALLOCA1:%.*]] = alloca i32, align 4, addrspace(5) @@ -884,7 +884,7 @@ entry: define float @sincos_f32_unused_result_cos(float %x) { ; CHECK-LABEL: define float @sincos_f32_unused_result_cos -; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] { +; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR6:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SIN:%.*]] = tail call contract float @_Z3sinf(float [[X]]) ; CHECK-NEXT: ret float [[SIN]] @@ -899,7 +899,7 @@ entry: define float @sincos_f32_unused_result_sin(float %x) { ; CHECK-LABEL: define float @sincos_f32_unused_result_sin -; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { +; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[COS:%.*]] = tail call contract float @_Z3cosf(float [[X]]) ; CHECK-NEXT: ret float [[COS]] @@ -914,7 +914,7 @@ entry: define void @sincos_f32_repeated_uses(float %x, ptr addrspace(1) %sin_out, ptr addrspace(1) %cos_out) { ; CHECK-LABEL: define void @sincos_f32_repeated_uses -; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) [[SIN_OUT:%.*]], ptr addrspace(1) [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR6:[0-9]+]] { +; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) [[SIN_OUT:%.*]], ptr addrspace(1) [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract float 
@_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -947,7 +947,7 @@ entry: define void @sin_f32_indirect_call_user(float %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out, ptr %func.ptr) { ; CHECK-LABEL: define void @sin_f32_indirect_call_user -; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]], ptr nocapture readonly [[FUNC_PTR:%.*]]) local_unnamed_addr { +; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]], ptr nocapture readonly [[FUNC_PTR:%.*]]) local_unnamed_addr #[[ATTR4]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CALL:%.*]] = tail call contract float @_Z3sinf(float [[X]]) ; CHECK-NEXT: store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4 @@ -965,7 +965,7 @@ entry: define void @cos_f32_indirect_call_user(float %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out, ptr %func.ptr) { ; CHECK-LABEL: define void @cos_f32_indirect_call_user -; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]], ptr nocapture readonly [[FUNC_PTR:%.*]]) local_unnamed_addr { +; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]], ptr nocapture readonly [[FUNC_PTR:%.*]]) local_unnamed_addr #[[ATTR4]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CALL:%.*]] = tail call contract float @_Z3cosf(float [[X]]) ; CHECK-NEXT: store float [[CALL]], ptr addrspace(1) [[COS_OUT]], align 4 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll index bd61558905f63..9ec8e425a3f55 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ 
-37,9 +37,9 @@ ; by 4 bytes. ; HSA-ALLOCA: .amdhsa_private_segment_fixed_size 24 -; HSA-ALLOCA: s_add_i32 s6, s6, s9 -; HSA-ALLOCA: s_mov_b32 flat_scratch_lo, s7 -; HSA-ALLOCA: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-ALLOCA: s_add_i32 s12, s12, s17 +; HSA-ALLOCA-DAG: s_mov_b32 flat_scratch_lo, s13 +; HSA-ALLOCA-DAG: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen ; encoding: [0x00,0x10,0x70,0xe0 ; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen ; encoding: [0x00,0x10,0x70,0xe0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll index cc116dfe807ec..8cda553e61c8a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll @@ -9,8 +9,8 @@ ; Legacy intrinsics that just read implicit parameters ; FUNC-LABEL: {{^}}ngroups_x: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x0 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x0 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -24,8 +24,8 @@ entry: } ; FUNC-LABEL: {{^}}ngroups_y: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x1 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x4 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -39,8 +39,8 @@ entry: } ; FUNC-LABEL: {{^}}ngroups_z: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x2 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x8 ; 
GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -54,8 +54,8 @@ entry: } ; FUNC-LABEL: {{^}}global_size_x: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x3 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0xc ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -69,8 +69,8 @@ entry: } ; FUNC-LABEL: {{^}}global_size_y: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10 +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x4 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x10 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -84,8 +84,8 @@ entry: } ; FUNC-LABEL: {{^}}global_size_z: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14 +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x5 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x14 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -99,8 +99,8 @@ entry: } ; FUNC-LABEL: {{^}}local_size_x: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x6 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x18 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -114,8 +114,8 @@ entry: } ; FUNC-LABEL: {{^}}local_size_y: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x7 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x1c ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -129,8 +129,8 @@ 
entry: } ; FUNC-LABEL: {{^}}local_size_z: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x8 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x20 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll index 87084d780410b..91abbfff7f2de 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll @@ -23,7 +23,7 @@ ; ELF: Section: .text (0x2) ; ELF: } -; GFX10: NumSGPRsForWavesPerEU: 2 +; GFX10: NumSGPRsForWavesPerEU: 4 ; GFX10: NumVGPRsForWavesPerEU: 1 define amdgpu_kernel void @simple(ptr addrspace(1) %out) { diff --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll index 897e134ee48d8..1f8da18cdd301 100644 --- a/llvm/test/CodeGen/AMDGPU/anyext.ll +++ b/llvm/test/CodeGen/AMDGPU/anyext.ll @@ -9,8 +9,8 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 { ; GCN-LABEL: anyext_i1_i32: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -22,8 +22,8 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 { ; ; GFX8-LABEL: anyext_i1_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -37,17 +37,17 @@ define amdgpu_kernel void @anyext_i1_i32(ptr 
addrspace(1) %out, i32 %cond) #0 { ; ; GFX9-LABEL: anyext_i1_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_eq_u32 s2, 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %tmp = icmp eq i32 %cond, 0 @@ -62,8 +62,8 @@ entry: define amdgpu_kernel void @s_anyext_i16_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) #0 { ; GCN-LABEL: s_anyext_i16_i32: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s14, 0 ; GCN-NEXT: s_mov_b32 s15, s11 @@ -88,8 +88,8 @@ define amdgpu_kernel void @s_anyext_i16_i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: s_anyext_i16_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s7 @@ -113,13 +113,13 @@ define amdgpu_kernel void @s_anyext_i16_i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: 
s_anyext_i16_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v3, v1, s[2:3] +; GFX9-NEXT: global_load_ushort v3, v1, s[0:1] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index 624101dc12c5f..fb764560154d5 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -24,18 +24,18 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_cbranch_execz .LBB0_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_mul_i32 s4, s4, 5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB0_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -52,18 +52,18 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, 
s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB0_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB0_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2 @@ -80,18 +80,18 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB0_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -107,10 +107,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: 
v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 @@ -118,9 +118,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB0_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) +; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -130,24 +131,25 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10W32-LABEL: add_i32_constant: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX10W32-NEXT: s_mul_i32 s3, s3, 5 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 +; 
GFX10W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB0_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) +; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -158,7 +160,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-LABEL: add_i32_constant: ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b64 s[0:1], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -166,7 +168,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 @@ -174,8 +176,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB0_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt 
vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 @@ -189,24 +191,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX11W32-LABEL: add_i32_constant: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_mov_b32 s3, exec_lo -; GFX11W32-NEXT: s_mov_b32 s2, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX11W32-NEXT: s_mov_b32 s1, exec_lo +; GFX11W32-NEXT: s_mov_b32 s0, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: s_mul_i32 s3, s3, 5 -; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 +; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB0_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 @@ -221,7 +223,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-LABEL: add_i32_constant: ; GFX12W64: ; %bb.0: ; %entry ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[2:3], exec +; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -229,7 +231,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 @@ -237,8 +239,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB0_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 @@ -252,24 +254,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12W32-LABEL: add_i32_constant: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_mov_b32 s3, exec_lo -; GFX12W32-NEXT: s_mov_b32 s2, exec_lo -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX12W32-NEXT: s_mov_b32 s1, exec_lo +; GFX12W32-NEXT: s_mov_b32 s0, exec_lo +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX12W32-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; 
GFX12W32-NEXT: s_mul_i32 s3, s3, 5 -; GFX12W32-NEXT: v_mov_b32_e32 v1, s3 +; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB0_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 @@ -290,23 +292,23 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: add_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 +; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_cbranch_execz .LBB1_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mul_i32 s4, s6, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB1_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -319,24 +321,24 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, 
s[0:1], 0x44 +; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mul_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -349,24 +351,24 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -378,16 +380,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W64-LABEL: add_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 +; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 @@ -395,9 +397,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) +; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3] @@ -407,37 +410,37 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W32-LABEL: add_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 +; GFX10W32-NEXT: 
s_load_dword s0, s[2:3], 0x44 ; GFX10W32-NEXT: s_mov_b32 s4, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 +; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5] +; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5] ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 +; GFX11W64-NEXT: s_load_b32 s6, s[2:3], 0x44 ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b64 s[0:1], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -445,7 +448,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 
; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: s_mul_i32 s4, s6, s4 @@ -453,8 +456,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB1_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) @@ -468,41 +471,41 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11W32-LABEL: add_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44 +; GFX11W32-NEXT: s_load_b32 s0, s[2:3], 0x44 ; GFX11W32-NEXT: s_mov_b32 s4, exec_lo -; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s1, exec_lo ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s4, s2, s4 +; GFX11W32-NEXT: s_mul_i32 s4, s0, s4 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB1_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; 
GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5] +; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5] ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_uniform: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_load_b32 s6, s[0:1], 0x44 +; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44 ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[2:3], exec +; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -510,7 +513,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 @@ -518,8 +521,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB1_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; 
GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -533,32 +536,32 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W32-LABEL: add_i32_uniform: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_load_b32 s2, s[0:1], 0x44 +; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44 ; GFX12W32-NEXT: s_mov_b32 s4, exec_lo -; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s1, exec_lo ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: s_mul_i32 s4, s2, s4 +; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB1_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5] +; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -571,8 +574,8 @@ entry: define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) 
%inout) { ; GFX6-LABEL: add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -583,36 +586,36 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB2_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB2_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; 
GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 @@ -624,36 +627,36 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB2_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB2_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; 
GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -664,37 +667,38 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W64-LABEL: add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b64 s[0:1], exec ; GFX10W64-NEXT: s_mov_b32 s4, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_add_i32 s4, s4, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX10W64-NEXT: ; %bb.3: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) +; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -704,36 +708,37 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W32-LABEL: add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s3, exec_lo -; GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: s_mov_b32 s1, exec_lo +; GFX10W32-NEXT: s_mov_b32 s0, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s1 ; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 -; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 -; GFX10W32-NEXT: s_add_i32 s2, s2, s5 -; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10W32-NEXT: v_writelane_b32 v1, s0, s4 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_add_i32 s0, s0, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX10W32-NEXT: ; %bb.3: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 
exec_lo, exec_lo, s3 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) +; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -743,174 +748,182 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX11W64-LABEL: add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11W64-NEXT: s_mov_b64 s[0:1], exec ; GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 ; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX11W64-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W64-NEXT: ; %bb.3: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB2_4: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 +; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_mov_b32 s3, exec_lo -; GFX11W32-NEXT: s_mov_b32 s2, 0 -; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11W32-NEXT: s_mov_b32 s1, exec_lo +; GFX11W32-NEXT: s_mov_b32 s0, 0 +; 
GFX11W32-NEXT: ; implicit-def: $vgpr0 ; GFX11W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 -; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX11W32-NEXT: s_add_i32 s2, s2, s5 -; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: s_add_i32 s0, s0, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W32-NEXT: ; %bb.3: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB2_4: -; GFX11W32-NEXT: s_or_b32 exec_lo, 
exec_lo, s3 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 +; GFX11W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_varying_vdata: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_mov_b64 s[2:3], exec +; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: s_mov_b32 s4, 0 -; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: ; implicit-def: $vgpr0 ; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W64-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX12W64-NEXT: ; implicit-def: $vgpr0 -; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W64-NEXT: ; %bb.3: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB2_4: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 +; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: add_i32_varying_vdata: ; 
GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_mov_b32 s3, exec_lo -; GFX12W32-NEXT: s_mov_b32 s2, 0 -; GFX12W32-NEXT: ; implicit-def: $vgpr1 +; GFX12W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12W32-NEXT: s_mov_b32 s1, exec_lo +; GFX12W32-NEXT: s_mov_b32 s0, 0 +; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_ctz_i32_b32 s4, s3 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 v1, s2, s4 -; GFX12W32-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX12W32-NEXT: s_add_co_i32 s2, s2, s5 -; GFX12W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX12W32-NEXT: ; implicit-def: $vgpr0 -; GFX12W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX12W32-NEXT: ; implicit-def: $vgpr1 +; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W32-NEXT: ; %bb.3: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 
0x34 +; GFX12W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB2_4: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 +; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -924,11 +937,11 @@ entry: define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %vindex) { ; GFX6-LABEL: struct_add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s2, s[0:1], 0x11 -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s8, s[2:3], 0x11 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 ; GFX6-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 idxen glc ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -938,38 +951,38 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; ; GFX8-LABEL: struct_add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; 
GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB3_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB3_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x44 -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_load_dword s5, s[2:3], 0x44 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB3_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 @@ -981,38 +994,38 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; ; GFX9-LABEL: struct_add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: 
s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB3_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dword s5, s[0:1], 0x44 -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s5, s[2:3], 0x44 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB3_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1023,40 +1036,41 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; ; GFX10W64-LABEL: struct_add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b64 s[0:1], exec ; GFX10W64-NEXT: 
s_mov_b32 s4, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB3_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_add_i32 s4, s4, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX10W64-NEXT: s_cbranch_execz .LBB3_4 ; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_clause 0x1 -; GFX10W64-NEXT: s_load_dword s5, s[0:1], 0x44 -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W64-NEXT: s_load_dword s5, s[2:3], 0x44 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mov_b32_e32 v2, s5 ; GFX10W64-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB3_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) +; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, 
v1 @@ -1066,39 +1080,40 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; ; GFX10W32-LABEL: struct_add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s3, exec_lo -; GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: s_mov_b32 s1, exec_lo +; GFX10W32-NEXT: s_mov_b32 s0, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB3_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s1 ; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 -; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 -; GFX10W32-NEXT: s_add_i32 s2, s2, s5 -; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10W32-NEXT: v_writelane_b32 v1, s0, s4 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_add_i32 s0, s0, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX10W32-NEXT: s_cbranch_execz .LBB3_4 ; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_clause 0x1 -; GFX10W32-NEXT: s_load_dword s8, s[0:1], 0x44 -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10W32-NEXT: s_load_dword s8, s[2:3], 0x44 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: v_mov_b32_e32 v2, s8 ; GFX10W32-NEXT: buffer_atomic_add v0, v2, s[4:7], 0 idxen glc ; GFX10W32-NEXT: .LBB3_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; 
GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) +; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -1108,186 +1123,192 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; ; GFX11W64-LABEL: struct_add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11W64-NEXT: s_mov_b64 s[0:1], exec ; GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 ; GFX11W64-NEXT: .LBB3_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, 
v0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX11W64-NEXT: s_cbranch_execz .LBB3_4 ; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b32 s5, s[0:1], 0x44 -; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX11W64-NEXT: s_load_b32 s5, s[2:3], 0x44 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: v_mov_b32_e32 v2, s5 -; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], 0 idxen glc +; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB3_4: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 +; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: struct_add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_mov_b32 s3, exec_lo -; GFX11W32-NEXT: s_mov_b32 s2, 0 -; GFX11W32-NEXT: ; 
implicit-def: $vgpr1 +; GFX11W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11W32-NEXT: s_mov_b32 s1, exec_lo +; GFX11W32-NEXT: s_mov_b32 s0, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 ; GFX11W32-NEXT: .LBB3_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 -; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX11W32-NEXT: s_add_i32 s2, s2, s5 -; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: s_add_i32 s0, s0, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX11W32-NEXT: s_cbranch_execz .LBB3_4 ; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b32 s8, s[0:1], 0x44 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX11W32-NEXT: s_load_b32 s8, s[2:3], 0x44 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; 
GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: v_mov_b32_e32 v2, s8 -; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], 0 idxen glc +; GFX11W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s8 +; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], 0 idxen glc ; GFX11W32-NEXT: .LBB3_4: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 +; GFX11W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: struct_add_i32_varying_vdata: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_mov_b64 s[2:3], exec +; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: s_mov_b32 s4, 0 -; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: ; implicit-def: $vgpr0 ; GFX12W64-NEXT: .LBB3_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX12W64-NEXT: v_writelane_b32 v0, 
s4, s5 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX12W64-NEXT: ; implicit-def: $vgpr0 -; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX12W64-NEXT: s_cbranch_execz .LBB3_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b32 s5, s[0:1], 0x44 -; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX12W64-NEXT: s_load_b32 s5, s[2:3], 0x44 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mov_b32_e32 v2, s5 -; GFX12W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB3_4: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 +; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: struct_add_i32_varying_vdata: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_mov_b32 s3, exec_lo -; GFX12W32-NEXT: s_mov_b32 s2, 0 -; GFX12W32-NEXT: ; implicit-def: $vgpr1 +; GFX12W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12W32-NEXT: s_mov_b32 s1, exec_lo +; GFX12W32-NEXT: s_mov_b32 s0, 0 +; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB3_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_ctz_i32_b32 s4, s3 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 v1, s2, s4 -; GFX12W32-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX12W32-NEXT: s_add_co_i32 s2, s2, s5 -; GFX12W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; 
GFX12W32-NEXT: ; implicit-def: $vgpr0 -; GFX12W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX12W32-NEXT: ; implicit-def: $vgpr1 +; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12W32-NEXT: s_cbranch_execz .LBB3_4 ; GFX12W32-NEXT: ; %bb.3: ; GFX12W32-NEXT: s_clause 0x1 -; GFX12W32-NEXT: s_load_b32 s8, s[0:1], 0x44 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX12W32-NEXT: s_load_b32 s8, s[2:3], 0x44 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: v_mov_b32_e32 v2, s8 -; GFX12W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s8 +; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB3_4: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 +; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -1301,8 +1322,8 @@ entry: define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc @@ -1314,9 +1335,9 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: add_i32_varying_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 offen glc ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1327,9 +1348,9 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: add_i32_varying_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -1339,9 +1360,10 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: add_i32_varying_offset: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -1349,33 +1371,67 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; 
GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: add_i32_varying_offset: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: add_i32_varying_offset: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX12-NEXT: v_mov_b32_e32 v1, 1 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm +; GFX11W64-LABEL: add_i32_varying_offset: +; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: s_clause 0x1 +; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W64-NEXT: s_nop 0 +; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W64-NEXT: s_endpgm +; +; GFX11W32-LABEL: add_i32_varying_offset: +; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: s_clause 0x1 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: 
v_dual_and_b32 v0, 0x3ff, v0 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W32-NEXT: s_nop 0 +; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W32-NEXT: s_endpgm +; +; GFX12W64-LABEL: add_i32_varying_offset: +; GFX12W64: ; %bb.0: ; %entry +; GFX12W64-NEXT: s_clause 0x1 +; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 +; GFX12W64-NEXT: s_wait_kmcnt 0x0 +; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX12W64-NEXT: s_wait_loadcnt 0x0 +; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12W64-NEXT: s_nop 0 +; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12W64-NEXT: s_endpgm +; +; GFX12W32-LABEL: add_i32_varying_offset: +; GFX12W32: ; %bb.0: ; %entry +; GFX12W32-NEXT: s_clause 0x1 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12W32-NEXT: s_wait_kmcnt 0x0 +; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX12W32-NEXT: s_wait_loadcnt 0x0 +; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12W32-NEXT: s_nop 0 +; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0) @@ -1391,18 +1447,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 
vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_cbranch_execz .LBB5_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_mul_i32 s4, s4, 5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB5_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1420,18 +1476,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB5_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB5_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1449,18 +1505,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; 
implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB5_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1477,10 +1533,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 @@ -1488,9 +1544,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) +; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: 
v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 @@ -1501,24 +1558,25 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10W32-LABEL: sub_i32_constant: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX10W32-NEXT: s_mul_i32 s3, s3, 5 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) +; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 @@ -1530,7 +1588,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-LABEL: sub_i32_constant: ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b64 s[0:1], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) @@ -1538,7 +1596,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 @@ -1546,8 +1604,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB5_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1562,24 +1620,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX11W32-LABEL: sub_i32_constant: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_mov_b32 s3, exec_lo -; GFX11W32-NEXT: s_mov_b32 s2, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX11W32-NEXT: s_mov_b32 s1, exec_lo +; GFX11W32-NEXT: s_mov_b32 s0, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: s_mul_i32 
s3, s3, 5 -; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 +; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB5_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1595,7 +1653,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-LABEL: sub_i32_constant: ; GFX12W64: ; %bb.0: ; %entry ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[2:3], exec +; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1603,7 +1661,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 @@ -1611,8 +1669,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB5_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1627,24 +1685,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12W32-LABEL: sub_i32_constant: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_mov_b32 s3, exec_lo -; GFX12W32-NEXT: s_mov_b32 s2, exec_lo -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX12W32-NEXT: s_mov_b32 s1, exec_lo +; GFX12W32-NEXT: s_mov_b32 s0, exec_lo +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX12W32-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: s_mul_i32 s3, s3, 5 -; GFX12W32-NEXT: v_mov_b32_e32 v1, s3 +; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB5_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1666,23 +1724,23 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: sub_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 +; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_cbranch_execz .LBB6_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mul_i32 s4, s6, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB6_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1695,24 +1753,24 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 +; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB6_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mul_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB6_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: 
s_waitcnt vmcnt(0) @@ -1725,24 +1783,24 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB6_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1754,16 +1812,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W64-LABEL: sub_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 +; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; 
GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 @@ -1771,8 +1829,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB6_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) @@ -1784,38 +1842,38 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W32-LABEL: sub_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 +; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 ; GFX10W32-NEXT: s_mov_b32 s4, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 +; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB6_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX10W32-NEXT: 
v_mul_lo_u32 v0, s0, v0 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 +; GFX11W64-NEXT: s_load_b32 s6, s[2:3], 0x44 ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b64 s[0:1], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1823,7 +1881,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: s_mul_i32 s4, s6, s4 @@ -1831,8 +1889,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB6_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) @@ -1847,42 +1905,42 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11W32-LABEL: sub_i32_uniform: ; GFX11W32: ; %bb.0: 
; %entry -; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44 +; GFX11W32-NEXT: s_load_b32 s0, s[2:3], 0x44 ; GFX11W32-NEXT: s_mov_b32 s4, exec_lo -; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s1, exec_lo ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s4, s2, s4 +; GFX11W32-NEXT: s_mul_i32 s4, s0, s4 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB6_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX11W32-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_uniform: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_load_b32 s6, s[0:1], 0x44 +; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44 ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[2:3], exec +; GFX12W64-NEXT: s_mov_b64 
s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1890,7 +1948,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 @@ -1898,8 +1956,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB6_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 @@ -1914,33 +1972,33 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W32-LABEL: sub_i32_uniform: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_load_b32 s2, s[0:1], 0x44 +; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44 ; GFX12W32-NEXT: s_mov_b32 s4, exec_lo -; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s1, exec_lo ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: 
s_mul_i32 s4, s2, s4 +; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB6_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX12W32-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -1953,8 +2011,8 @@ entry: define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -1965,36 +2023,36 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB7_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: 
s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB7_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB7_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1 @@ -2006,36 +2064,36 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB7_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; 
GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB7_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB7_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2046,37 +2104,38 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W64-LABEL: sub_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b64 s[0:1], exec ; GFX10W64-NEXT: s_mov_b32 s4, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; 
GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_add_i32 s4, s4, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX10W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX10W64-NEXT: ; %bb.3: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB7_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) +; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 @@ -2086,36 +2145,37 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W32-LABEL: sub_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s3, exec_lo -; GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: s_mov_b32 s1, exec_lo +; GFX10W32-NEXT: s_mov_b32 s0, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s1 ; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 ; 
GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 -; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 -; GFX10W32-NEXT: s_add_i32 s2, s2, s5 -; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10W32-NEXT: v_writelane_b32 v1, s0, s4 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_add_i32 s0, s0, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX10W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX10W32-NEXT: ; %bb.3: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB7_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) +; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 @@ -2125,176 +2185,184 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX11W64-LABEL: sub_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11W64-NEXT: s_mov_b64 s[0:1], exec ; GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: ; implicit-def: 
$vgpr0 ; GFX11W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX11W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX11W64-NEXT: ; %bb.3: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 
0 glc +; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB7_4: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 +; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_mov_b32 s3, exec_lo -; GFX11W32-NEXT: s_mov_b32 s2, 0 -; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11W32-NEXT: s_mov_b32 s1, exec_lo +; GFX11W32-NEXT: s_mov_b32 s0, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 ; GFX11W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 -; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX11W32-NEXT: s_add_i32 s2, s2, s5 -; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; 
GFX11W32-NEXT: s_add_i32 s0, s0, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX11W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX11W32-NEXT: ; %bb.3: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB7_4: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_varying_vdata: ; GFX12W64: ; %bb.0: ; 
%entry -; GFX12W64-NEXT: s_mov_b64 s[2:3], exec +; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: s_mov_b32 s4, 0 -; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: ; implicit-def: $vgpr0 ; GFX12W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX12W64-NEXT: ; implicit-def: $vgpr0 -; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX12W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX12W64-NEXT: ; %bb.3: -; GFX12W64-NEXT: s_load_b128 
s[8:11], s[0:1], 0x34 -; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB7_4: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 +; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: sub_i32_varying_vdata: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_mov_b32 s3, exec_lo -; GFX12W32-NEXT: s_mov_b32 s2, 0 -; GFX12W32-NEXT: ; implicit-def: $vgpr1 +; GFX12W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12W32-NEXT: s_mov_b32 s1, exec_lo +; GFX12W32-NEXT: s_mov_b32 s0, 0 +; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_ctz_i32_b32 s4, s3 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 
v1, s2, s4 -; GFX12W32-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX12W32-NEXT: s_add_co_i32 s2, s2, s5 -; GFX12W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX12W32-NEXT: ; implicit-def: $vgpr0 -; GFX12W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX12W32-NEXT: ; implicit-def: $vgpr1 +; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX12W32-NEXT: ; %bb.3: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB7_4: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 +; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, 
v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -2308,8 +2376,8 @@ entry: define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc @@ -2321,9 +2389,9 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: sub_i32_varying_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 offen glc ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -2334,9 +2402,9 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: sub_i32_varying_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -2346,9 +2414,10 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: sub_i32_varying_offset: ; GFX10: ; %bb.0: ; %entry -; 
GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -2356,36 +2425,73 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: sub_i32_varying_offset: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: sub_i32_varying_offset: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX12-NEXT: v_mov_b32_e32 v1, 1 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm +; GFX11W64-LABEL: sub_i32_varying_offset: +; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: s_clause 0x1 +; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v0, 
s[4:7], 0 offen glc +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W64-NEXT: s_nop 0 +; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W64-NEXT: s_endpgm +; +; GFX11W32-LABEL: sub_i32_varying_offset: +; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: s_clause 0x1 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W32-NEXT: s_nop 0 +; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W32-NEXT: s_endpgm +; +; GFX12W64-LABEL: sub_i32_varying_offset: +; GFX12W64: ; %bb.0: ; %entry +; GFX12W64-NEXT: s_clause 0x1 +; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 +; GFX12W64-NEXT: s_wait_kmcnt 0x0 +; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX12W64-NEXT: s_wait_loadcnt 0x0 +; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12W64-NEXT: s_nop 0 +; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12W64-NEXT: s_endpgm +; +; GFX12W32-LABEL: sub_i32_varying_offset: +; GFX12W32: ; %bb.0: ; %entry +; GFX12W32-NEXT: s_clause 0x1 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12W32-NEXT: s_wait_kmcnt 0x0 +; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX12W32-NEXT: s_wait_loadcnt 0x0 
+; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12W32-NEXT: s_nop 0 +; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11: {{.*}} +; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index d3944d3d52d77..ca4812f345958 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -17,7 +17,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-LABEL: add_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -48,7 +48,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX89-LABEL: add_i32_constant: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: s_mov_b64 s[6:7], exec ; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 @@ -80,7 +80,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1064-LABEL: add_i32_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; 
GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -114,7 +114,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1032-LABEL: add_i32_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 @@ -147,7 +147,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1164-LABEL: add_i32_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -184,7 +184,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1132-LABEL: add_i32_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 @@ -220,7 +220,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1264-LABEL: add_i32_constant: ; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec ; GFX1264-NEXT: s_mov_b64 s[4:5], exec ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -256,7 +256,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1232-LABEL: add_i32_constant: ; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232-NEXT: s_mov_b32 s5, exec_lo ; GFX1232-NEXT: s_mov_b32 s4, exec_lo ; 
GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 @@ -297,25 +297,25 @@ entry: define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i32 %additive) { ; GFX7LESS-LABEL: add_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7LESS-NEXT: s_load_dword s8, s[0:1], 0xd -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec +; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dword s2, s[2:3], 0xd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s9, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[8:9] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s2, s8, s2 -; GFX7LESS-NEXT: s_mov_b32 s14, -1 -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 -; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc +; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: s_mov_b32 s8, s6 +; GFX7LESS-NEXT: s_mov_b32 s9, s7 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 +; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: .LBB1_2: @@ -324,36 +324,36 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s8, v0 +; 
GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s8, s[0:1], 0x34 -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s2, s8, s2 +; GFX8-NEXT: s_mul_i32 s0, s8, s0 ; GFX8-NEXT: s_mov_b32 s15, 0xf000 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s12, s6 ; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: .LBB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v1 @@ -365,29 +365,29 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; 
GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s2, s8, s2 +; GFX9-NEXT: s_mul_i32 s0, s8, s0 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 @@ -400,20 +400,20 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-LABEL: add_i32_uniform: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1064-NEXT: s_load_dword s8, s[0:1], 0x34 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s10, s[2:3], 0x34 +; GFX1064-NEXT: s_mov_b64 s[8:9], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB1_2 ; 
GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s2, s8, s2 +; GFX1064-NEXT: s_mul_i32 s2, s10, s2 ; GFX1064-NEXT: s_mov_b32 s14, -1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: s_mov_b32 s12, s6 @@ -429,28 +429,28 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1] +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v0, s[0:1] ; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i32_uniform: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX1032-NEXT: s_mov_b32 s8, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB1_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 +; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s8 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s1, s2, s1 +; GFX1032-NEXT: s_mul_i32 s2, s0, s2 ; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-NEXT: s_mov_b32 s8, s6 ; GFX1032-NEXT: s_mov_b32 s9, s7 ; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc @@ -459,38 +459,38 @@ define 
amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB1_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1] +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3] ; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i32_uniform: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1164-NEXT: s_load_b32 s8, s[0:1], 0x34 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b32 s2, s[2:3], 0x34 +; GFX1164-NEXT: s_mov_b64 s[8:9], exec ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB1_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1164-NEXT: s_mov_b32 s15, 0x31016000 +; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[8:9] +; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mul_i32 s2, s8, s2 -; GFX1164-NEXT: s_mov_b32 s14, -1 -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 -; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[12:15], 0 glc +; GFX1164-NEXT: s_mul_i32 s3, s2, s3 +; GFX1164-NEXT: s_mov_b32 
s10, -1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s3 +; GFX1164-NEXT: s_mov_b32 s8, s6 +; GFX1164-NEXT: s_mov_b32 s9, s7 +; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv @@ -501,7 +501,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s6, -1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s8, v0, s[0:1] +; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[0:1] ; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0 ; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -510,17 +510,17 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-LABEL: add_i32_uniform: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1132-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX1132-NEXT: s_mov_b32 s8, exec_lo ; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB1_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s8 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s2, s0, s2 @@ -548,26 +548,26 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-LABEL: add_i32_uniform: ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_clause 0x1 -; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1264-NEXT: s_load_b32 
s8, s[0:1], 0x34 -; GFX1264-NEXT: s_mov_b64 s[2:3], exec +; GFX1264-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1264-NEXT: s_load_b32 s2, s[2:3], 0x34 +; GFX1264-NEXT: s_mov_b64 s[8:9], exec ; GFX1264-NEXT: s_mov_b64 s[0:1], exec -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1264-NEXT: s_cbranch_execz .LBB1_2 ; GFX1264-NEXT: ; %bb.1: -; GFX1264-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1264-NEXT: s_mov_b32 s15, 0x31016000 +; GFX1264-NEXT: s_bcnt1_i32_b64 s3, s[8:9] +; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mul_i32 s2, s8, s2 -; GFX1264-NEXT: s_mov_b32 s14, -1 -; GFX1264-NEXT: v_mov_b32_e32 v1, s2 -; GFX1264-NEXT: s_mov_b32 s12, s6 -; GFX1264-NEXT: s_mov_b32 s13, s7 -; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[12:15], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: s_mul_i32 s3, s2, s3 +; GFX1264-NEXT: s_mov_b32 s10, -1 +; GFX1264-NEXT: v_mov_b32_e32 v1, s3 +; GFX1264-NEXT: s_mov_b32 s8, s6 +; GFX1264-NEXT: s_mov_b32 s9, s7 +; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB1_2: @@ -577,7 +577,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s6, -1 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s8, v0, s[0:1] +; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[0:1] ; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null ; GFX1264-NEXT: s_nop 0 ; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -586,17 +586,17 @@ define 
amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-LABEL: add_i32_uniform: ; GFX1232: ; %bb.0: ; %entry ; GFX1232-NEXT: s_clause 0x1 -; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1232-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX1232-NEXT: s_mov_b32 s2, exec_lo +; GFX1232-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1232-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX1232-NEXT: s_mov_b32 s8, exec_lo ; GFX1232-NEXT: s_mov_b32 s1, exec_lo -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1232-NEXT: s_cbranch_execz .LBB1_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s8 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mul_i32 s2, s0, s2 @@ -628,7 +628,7 @@ entry: define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) { ; GFX7LESS-LABEL: add_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: s_mov_b32 s10, s6 @@ -646,22 +646,22 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s6, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s4, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s4 ; GFX8-NEXT: v_readlane_b32 s7, v0, s4 ; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX8-NEXT: v_writelane_b32 v1, s6, m0 ; GFX8-NEXT: s_add_i32 s6, s6, s7 -; 
GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -691,22 +691,22 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s6, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s4, s[2:3] +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s4 ; GFX9-NEXT: v_readlane_b32 s7, v0, s4 ; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX9-NEXT: v_writelane_b32 v1, s6, m0 ; GFX9-NEXT: s_add_i32 s6, s6, s7 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -736,21 +736,21 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1064-LABEL: add_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: s_mov_b32 s6, 0 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop 
Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX1064-NEXT: s_ff1_i32_b64 s7, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s7 ; GFX1064-NEXT: v_writelane_b32 v1, s6, s7 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX1064-NEXT: s_add_i32 s6, s6, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -782,21 +782,21 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1032-LABEL: add_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 -; GFX1032-NEXT: v_readlane_b32 s5, v0, s3 -; GFX1032-NEXT: s_lshl_b32 s6, 1, s3 -; GFX1032-NEXT: v_writelane_b32 v1, s4, s3 -; GFX1032-NEXT: s_andn2_b32 s2, s2, s6 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s1 +; GFX1032-NEXT: v_writelane_b32 v1, s4, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s6 ; GFX1032-NEXT: s_add_i32 s4, s4, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; 
implicit-def: $vgpr0 @@ -827,49 +827,51 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1164-LABEL: add_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_mov_b32 s6, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: ; implicit-def: $vgpr0 ; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s7, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164-NEXT: s_ctz_i32_b64 s7, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s8, v1, s7 ; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX1164-NEXT: v_writelane_b32 v1, s6, s7 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] +; GFX1164-NEXT: v_writelane_b32 v0, s6, s7 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_add_i32 s6, s6, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1164-NEXT: s_cbranch_execz .LBB2_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164-NEXT: v_mov_b32_e32 v1, s6 ; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s10, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_mov_b32 s8, s2 ; GFX1164-NEXT: s_mov_b32 s9, s3 -; GFX1164-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc +; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB2_4: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_nop 0 @@ -878,47 +880,49 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1132-LABEL: add_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_readlane_b32 s5, v0, s3 -; GFX1132-NEXT: s_lshl_b32 s6, 1, s3 -; GFX1132-NEXT: v_writelane_b32 v1, s4, s3 -; GFX1132-NEXT: s_and_not1_b32 s2, s2, s6 +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: 
v_readlane_b32 s5, v1, s1 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s1 +; GFX1132-NEXT: v_writelane_b32 v0, s4, s1 +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s6 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: s_add_i32 s4, s4, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5 ; GFX1132-NEXT: s_cbranch_execz .LBB2_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132-NEXT: v_mov_b32_e32 v1, s4 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s10, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mov_b32 s8, s2 ; GFX1132-NEXT: s_mov_b32 s9, s3 -; GFX1132-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc +; GFX1132-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB2_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_nop 0 @@ -927,48 +931,50 @@ define 
amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1264-LABEL: add_i32_varying: ; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: s_mov_b64 s[2:3], exec +; GFX1264-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1264-NEXT: s_mov_b64 s[0:1], exec ; GFX1264-NEXT: s_mov_b32 s6, 0 -; GFX1264-NEXT: ; implicit-def: $vgpr1 +; GFX1264-NEXT: ; implicit-def: $vgpr0 ; GFX1264-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-NEXT: s_ctz_i32_b64 s7, s[2:3] -; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1264-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1264-NEXT: s_ctz_i32_b64 s7, s[0:1] +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264-NEXT: v_readlane_b32 s8, v1, s7 ; GFX1264-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX1264-NEXT: v_writelane_b32 v1, s6, s7 -; GFX1264-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] +; GFX1264-NEXT: v_writelane_b32 v0, s6, s7 +; GFX1264-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1264-NEXT: s_add_co_i32 s6, s6, s8 -; GFX1264-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1264-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1264-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1264-NEXT: ; implicit-def: $vgpr0 +; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1264-NEXT: ; implicit-def: $vgpr1 ; GFX1264-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; 
GFX1264-NEXT: s_cbranch_execz .LBB2_4 ; GFX1264-NEXT: ; %bb.3: -; GFX1264-NEXT: v_mov_b32_e32 v0, s6 +; GFX1264-NEXT: v_mov_b32_e32 v1, s6 ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s10, -1 ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s8, s2 ; GFX1264-NEXT: s_mov_b32 s9, s3 -; GFX1264-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB2_4: ; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1264-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1264-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1264-NEXT: s_mov_b32 s2, -1 ; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1264-NEXT: s_nop 0 @@ -977,46 +983,48 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1232-LABEL: add_i32_varying: ; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: s_mov_b32 s2, exec_lo +; GFX1232-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1232-NEXT: s_mov_b32 s0, exec_lo ; GFX1232-NEXT: s_mov_b32 s4, 0 -; GFX1232-NEXT: ; implicit-def: $vgpr1 +; GFX1232-NEXT: ; implicit-def: $vgpr0 ; GFX1232-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1232-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1232-NEXT: v_readlane_b32 s5, v0, s3 -; GFX1232-NEXT: s_lshl_b32 s6, 1, s3 -; GFX1232-NEXT: v_writelane_b32 v1, s4, s3 -; GFX1232-NEXT: s_and_not1_b32 s2, s2, s6 +; GFX1232-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232-NEXT: v_readlane_b32 s5, v1, s1 +; 
GFX1232-NEXT: s_lshl_b32 s6, 1, s1 +; GFX1232-NEXT: v_writelane_b32 v0, s4, s1 +; GFX1232-NEXT: s_and_not1_b32 s0, s0, s6 +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232-NEXT: s_add_co_i32 s4, s4, s5 -; GFX1232-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1232-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1232-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1232-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1232-NEXT: ; implicit-def: $vgpr0 +; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1232-NEXT: ; implicit-def: $vgpr1 ; GFX1232-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX1232-NEXT: s_xor_b32 s5, exec_lo, s5 ; GFX1232-NEXT: s_cbranch_execz .LBB2_4 ; GFX1232-NEXT: ; %bb.3: -; GFX1232-NEXT: v_mov_b32_e32 v0, s4 +; GFX1232-NEXT: v_mov_b32_e32 v1, s4 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_mov_b32 s10, -1 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 ; GFX1232-NEXT: s_mov_b32 s9, s3 -; GFX1232-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB2_4: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1232-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1232-NEXT: s_mov_b32 s2, -1 ; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1232-NEXT: s_nop 0 @@ -1033,7 +1041,7 @@ define 
amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-LABEL: add_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -1071,7 +1079,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX89-LABEL: add_i64_constant: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: s_mov_b64 s[6:7], exec ; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX89-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -1108,7 +1116,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1064-LABEL: add_i64_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -1144,7 +1152,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1032-LABEL: add_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 @@ -1179,7 +1187,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1164-LABEL: add_i64_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: 
v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -1218,7 +1226,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1132-LABEL: add_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 @@ -1255,7 +1263,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1264-LABEL: add_i64_constant: ; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec ; GFX1264-NEXT: s_mov_b32 s9, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -1294,7 +1302,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1232-LABEL: add_i64_constant: ; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232-NEXT: s_mov_b32 s4, exec_lo ; GFX1232-NEXT: s_mov_b32 s5, 0 ; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0 @@ -1338,8 +1346,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS-LABEL: add_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -1382,8 +1390,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add_i64_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: s_mov_b64 s[8:9], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 @@ -1422,24 +1430,24 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i64_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: s_mov_b64 s[8:9], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9] ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mul_i32 s7, s3, s6 -; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 +; GFX9-NEXT: s_mul_i32 s7, s1, s6 +; GFX9-NEXT: s_mul_hi_u32 s8, s0, s6 ; GFX9-NEXT: s_add_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_i32 s6, s2, s6 +; GFX9-NEXT: s_mul_i32 s6, s0, s6 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 @@ -1448,38 +1456,38 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB4_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_readfirstlane_b32 s1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; 
GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s0, v2, v[0:1] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v2, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s1, v2, v[1:2] ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i64_uniform: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX1064-NEXT: s_mov_b64 s[8:9], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s8, s[8:9] ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s9, s3, s8 -; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s8 -; GFX1064-NEXT: s_mul_i32 s8, s2, s8 +; GFX1064-NEXT: s_mul_i32 s9, s1, s8 +; GFX1064-NEXT: s_mul_hi_u32 s10, s0, s8 +; GFX1064-NEXT: s_mul_i32 s8, s0, s8 ; GFX1064-NEXT: s_add_i32 s10, s10, s9 ; GFX1064-NEXT: v_mov_b32_e32 v0, s8 ; GFX1064-NEXT: v_mov_b32_e32 v1, s10 @@ -1492,37 +1500,37 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB4_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: 
v_readfirstlane_b32 s3, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, s[0:1] -; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v2, v[1:2] +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s0, v2, s[2:3] +; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s1, v2, v[1:2] ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i64_uniform: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX1032-NEXT: s_mov_b32 s8, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s8 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s8 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s8, s3, s1 -; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s1 -; GFX1032-NEXT: s_mul_i32 s1, s2, s1 +; GFX1032-NEXT: s_mul_i32 s8, s1, s3 +; GFX1032-NEXT: s_mul_hi_u32 s9, s0, s3 +; GFX1032-NEXT: s_mul_i32 s3, s0, s3 ; GFX1032-NEXT: s_add_i32 s9, s9, s8 -; GFX1032-NEXT: v_mov_b32_e32 v0, s1 +; GFX1032-NEXT: v_mov_b32_e32 v0, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, s9 ; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: s_mov_b32 s8, s6 @@ -1533,22 +1541,22 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB4_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 
exec_lo, exec_lo, s0 -; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v2, s[0:1] -; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s0, s3, v2, v[1:2] +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v2, s[2:3] +; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s0, s1, v2, v[1:2] ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i64_uniform: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1164-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX1164-NEXT: s_mov_b64 s[8:9], exec ; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 @@ -1594,17 +1602,17 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-LABEL: add_i64_uniform: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1132-NEXT: s_mov_b32 s8, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s8 ; GFX1132-NEXT: s_mov_b32 s11, 
0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s8, s1, s3 @@ -1640,8 +1648,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-LABEL: add_i64_uniform: ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_clause 0x1 -; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1264-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1264-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1264-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX1264-NEXT: s_mov_b64 s[8:9], exec ; GFX1264-NEXT: s_mov_b32 s11, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 @@ -1682,18 +1690,18 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-LABEL: add_i64_uniform: ; GFX1232: ; %bb.0: ; %entry ; GFX1232-NEXT: s_clause 0x1 -; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1232-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX1232-NEXT: s_mov_b32 s2, exec_lo +; GFX1232-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1232-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1232-NEXT: s_mov_b32 s9, exec_lo ; GFX1232-NEXT: s_mov_b32 s3, 0 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0 ; GFX1232-NEXT: s_mov_b32 s8, exec_lo ; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB4_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s9 ; GFX1232-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mul_u64 s[2:3], s[0:1], s[2:3] @@ -1727,7 +1735,7 @@ entry: define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) { ; GFX7LESS-LABEL: add_i64_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; 
GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -1746,7 +1754,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX89-LABEL: add_i64_varying: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -1765,7 +1773,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: add_i64_varying: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 @@ -1783,48 +1791,93 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: add_i64_varying: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: add_i64_varying: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_mov_b32 s7, 0x31016000 -; GFX12-NEXT: s_mov_b32 s6, -1 -; GFX12-NEXT: s_mov_b32 s11, s7 -; GFX12-NEXT: s_mov_b32 s10, 
s6 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s8, s2 -; GFX12-NEXT: s_mov_b32 s9, s3 -; GFX12-NEXT: s_mov_b32 s4, s0 -; GFX12-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_mov_b32 s5, s1 -; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm +; GFX1164-LABEL: add_i64_varying: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s6, -1 +; GFX1164-NEXT: s_mov_b32 s11, s7 +; GFX1164-NEXT: s_mov_b32 s10, s6 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_mov_b32 s8, s2 +; GFX1164-NEXT: s_mov_b32 s9, s3 +; GFX1164-NEXT: s_mov_b32 s4, s0 +; GFX1164-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: buffer_gl1_inv +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: s_mov_b32 s5, s1 +; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: add_i64_varying: +; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s6, -1 +; GFX1132-NEXT: s_mov_b32 s11, s7 +; GFX1132-NEXT: s_mov_b32 s10, s6 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_mov_b32 s8, s2 +; GFX1132-NEXT: s_mov_b32 s9, s3 +; GFX1132-NEXT: s_mov_b32 s4, s0 +; GFX1132-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: buffer_gl1_inv +; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: s_mov_b32 s5, 
s1 +; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-NEXT: s_endpgm +; +; GFX1264-LABEL: add_i64_varying: +; GFX1264: ; %bb.0: ; %entry +; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264-NEXT: v_mov_b32_e32 v1, 0 +; GFX1264-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1264-NEXT: s_mov_b32 s6, -1 +; GFX1264-NEXT: s_mov_b32 s11, s7 +; GFX1264-NEXT: s_mov_b32 s10, s6 +; GFX1264-NEXT: s_wait_kmcnt 0x0 +; GFX1264-NEXT: s_mov_b32 s8, s2 +; GFX1264-NEXT: s_mov_b32 s9, s3 +; GFX1264-NEXT: s_mov_b32 s4, s0 +; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: s_wait_loadcnt 0x0 +; GFX1264-NEXT: global_inv scope:SCOPE_DEV +; GFX1264-NEXT: s_mov_b32 s5, s1 +; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX1264-NEXT: s_nop 0 +; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1264-NEXT: s_endpgm +; +; GFX1232-LABEL: add_i64_varying: +; GFX1232: ; %bb.0: ; %entry +; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1232-NEXT: s_mov_b32 s6, -1 +; GFX1232-NEXT: s_mov_b32 s11, s7 +; GFX1232-NEXT: s_mov_b32 s10, s6 +; GFX1232-NEXT: s_wait_kmcnt 0x0 +; GFX1232-NEXT: s_mov_b32 s8, s2 +; GFX1232-NEXT: s_mov_b32 s9, s3 +; GFX1232-NEXT: s_mov_b32 s4, s0 +; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232-NEXT: s_wait_loadcnt 0x0 +; GFX1232-NEXT: global_inv scope:SCOPE_DEV +; GFX1232-NEXT: s_mov_b32 s5, s1 +; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX1232-NEXT: s_nop 0 +; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1232-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %zext = zext i32 %lane to i64 @@ -1837,7 +1890,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr 
addrspace(1) %out, ptr addrspace ; GFX7LESS-LABEL: sub_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -1869,7 +1922,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: sub_i32_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 @@ -1902,7 +1955,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: sub_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 @@ -1935,7 +1988,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1064-LABEL: sub_i32_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -1970,7 +2023,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1032-LABEL: sub_i32_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 @@ -2004,7 +2057,7 @@ define amdgpu_kernel 
void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1164-LABEL: sub_i32_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -2042,7 +2095,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1132-LABEL: sub_i32_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 @@ -2079,7 +2132,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1264-LABEL: sub_i32_constant: ; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec ; GFX1264-NEXT: s_mov_b64 s[4:5], exec ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -2116,7 +2169,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1232-LABEL: sub_i32_constant: ; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232-NEXT: s_mov_b32 s5, exec_lo ; GFX1232-NEXT: s_mov_b32 s4, exec_lo ; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 @@ -2158,25 +2211,25 @@ entry: define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i32 %subitive) { ; GFX7LESS-LABEL: sub_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7LESS-NEXT: s_load_dword s8, s[0:1], 0xd -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: s_mov_b64 
s[8:9], exec +; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dword s2, s[2:3], 0xd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s9, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[8:9] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s2, s8, s2 -; GFX7LESS-NEXT: s_mov_b32 s14, -1 -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 -; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc +; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: s_mov_b32 s8, s6 +; GFX7LESS-NEXT: s_mov_b32 s9, s7 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 +; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: .LBB7_2: @@ -2185,36 +2238,36 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s8, s[0:1], 0x34 -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s8, 
s[2:3], 0x34 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB7_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s2, s8, s2 +; GFX8-NEXT: s_mul_i32 s0, s8, s0 ; GFX8-NEXT: s_mov_b32 s15, 0xf000 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s12, s6 ; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: .LBB7_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v1 @@ -2226,29 +2279,29 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s2, s8, s2 +; GFX9-NEXT: s_mul_i32 s0, s8, s0 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB7_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 @@ -2261,20 +2314,20 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-LABEL: sub_i32_uniform: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1064-NEXT: s_load_dword s8, s[0:1], 0x34 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s10, s[2:3], 0x34 +; GFX1064-NEXT: s_mov_b64 s[8:9], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB7_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s2, s8, s2 +; GFX1064-NEXT: s_mul_i32 s2, s10, s2 ; GFX1064-NEXT: s_mov_b32 s14, -1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: s_mov_b32 s12, s6 @@ -2287,7 +2340,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; 
GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX1064-NEXT: v_mul_lo_u32 v0, s10, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 @@ -2298,21 +2351,21 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-LABEL: sub_i32_uniform: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX1032-NEXT: s_mov_b32 s8, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB7_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 +; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s8 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s1, s2, s1 +; GFX1032-NEXT: s_mul_i32 s2, s0, s2 ; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-NEXT: s_mov_b32 s8, s6 ; GFX1032-NEXT: s_mov_b32 s9, s7 ; GFX1032-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc @@ -2321,9 +2374,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB7_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX1032-NEXT: 
v_readfirstlane_b32 s0, v1 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 @@ -2334,33 +2387,33 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-LABEL: sub_i32_uniform: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1164-NEXT: s_load_b32 s8, s[0:1], 0x34 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b32 s2, s[2:3], 0x34 +; GFX1164-NEXT: s_mov_b64 s[8:9], exec ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB7_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1164-NEXT: s_mov_b32 s15, 0x31016000 +; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[8:9] +; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mul_i32 s2, s8, s2 -; GFX1164-NEXT: s_mov_b32 s14, -1 -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 -; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[12:15], 0 glc +; GFX1164-NEXT: s_mul_i32 s3, s2, s3 +; GFX1164-NEXT: s_mov_b32 s10, -1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s3 +; GFX1164-NEXT: s_mov_b32 s8, s6 +; GFX1164-NEXT: s_mov_b32 s9, s7 +; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB7_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX1164-NEXT: v_mul_lo_u32 v0, 
s2, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s6, -1 @@ -2374,17 +2427,17 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-LABEL: sub_i32_uniform: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1132-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX1132-NEXT: s_mov_b32 s8, exec_lo ; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB7_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s8 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s2, s0, s2 @@ -2413,32 +2466,32 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-LABEL: sub_i32_uniform: ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_clause 0x1 -; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1264-NEXT: s_load_b32 s8, s[0:1], 0x34 -; GFX1264-NEXT: s_mov_b64 s[2:3], exec +; GFX1264-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1264-NEXT: s_load_b32 s2, s[2:3], 0x34 +; GFX1264-NEXT: s_mov_b64 s[8:9], exec ; GFX1264-NEXT: s_mov_b64 s[0:1], exec -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1264-NEXT: s_cbranch_execz 
.LBB7_2 ; GFX1264-NEXT: ; %bb.1: -; GFX1264-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1264-NEXT: s_mov_b32 s15, 0x31016000 +; GFX1264-NEXT: s_bcnt1_i32_b64 s3, s[8:9] +; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mul_i32 s2, s8, s2 -; GFX1264-NEXT: s_mov_b32 s14, -1 -; GFX1264-NEXT: v_mov_b32_e32 v1, s2 -; GFX1264-NEXT: s_mov_b32 s12, s6 -; GFX1264-NEXT: s_mov_b32 s13, s7 -; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[12:15], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: s_mul_i32 s3, s2, s3 +; GFX1264-NEXT: s_mov_b32 s10, -1 +; GFX1264-NEXT: v_mov_b32_e32 v1, s3 +; GFX1264-NEXT: s_mov_b32 s8, s6 +; GFX1264-NEXT: s_mov_b32 s9, s7 +; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB7_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX1264-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1264-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s6, -1 @@ -2452,17 +2505,17 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-LABEL: sub_i32_uniform: ; GFX1232: ; %bb.0: ; %entry ; GFX1232-NEXT: s_clause 0x1 -; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1232-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX1232-NEXT: s_mov_b32 s2, exec_lo +; GFX1232-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1232-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX1232-NEXT: s_mov_b32 s8, exec_lo ; GFX1232-NEXT: s_mov_b32 s1, exec_lo -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1232-NEXT: s_cbranch_execz .LBB7_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1232-NEXT: 
s_bcnt1_i32_b32 s2, s8 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mul_i32 s2, s0, s2 @@ -2495,7 +2548,7 @@ entry: define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) { ; GFX7LESS-LABEL: sub_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: s_mov_b32 s10, s6 @@ -2513,22 +2566,22 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: sub_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s6, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB8_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s4, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s4 ; GFX8-NEXT: v_readlane_b32 s7, v0, s4 ; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX8-NEXT: v_writelane_b32 v1, s6, m0 ; GFX8-NEXT: s_add_i32 s6, s6, s7 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -2558,22 +2611,22 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s6, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB8_1: ; 
%ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s4, s[2:3] +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s4 ; GFX9-NEXT: v_readlane_b32 s7, v0, s4 ; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX9-NEXT: v_writelane_b32 v1, s6, m0 ; GFX9-NEXT: s_add_i32 s6, s6, s7 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -2603,21 +2656,21 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1064-LABEL: sub_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: s_mov_b32 s6, 0 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX1064-NEXT: s_ff1_i32_b64 s7, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s7 ; GFX1064-NEXT: v_writelane_b32 v1, s6, s7 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX1064-NEXT: s_add_i32 s6, s6, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -2649,21 +2702,21 @@ 
define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1032-LABEL: sub_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 -; GFX1032-NEXT: v_readlane_b32 s5, v0, s3 -; GFX1032-NEXT: s_lshl_b32 s6, 1, s3 -; GFX1032-NEXT: v_writelane_b32 v1, s4, s3 -; GFX1032-NEXT: s_andn2_b32 s2, s2, s6 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s1 +; GFX1032-NEXT: v_writelane_b32 v1, s4, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s6 ; GFX1032-NEXT: s_add_i32 s4, s4, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 @@ -2694,49 +2747,51 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1164-LABEL: sub_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_mov_b32 s6, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: ; implicit-def: $vgpr0 ; GFX1164-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s7, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164-NEXT: s_ctz_i32_b64 s7, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s8, v1, s7 ; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX1164-NEXT: v_writelane_b32 v1, s6, s7 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] +; GFX1164-NEXT: v_writelane_b32 v0, s6, s7 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_add_i32 s6, s6, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1164-NEXT: s_cbranch_execz .LBB8_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164-NEXT: v_mov_b32_e32 v1, s6 ; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s10, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_mov_b32 s8, s2 ; GFX1164-NEXT: s_mov_b32 s9, s3 -; GFX1164-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc +; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB8_4: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; 
GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_nop 0 @@ -2745,47 +2800,49 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1132-LABEL: sub_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_readlane_b32 s5, v0, s3 -; GFX1132-NEXT: s_lshl_b32 s6, 1, s3 -; GFX1132-NEXT: v_writelane_b32 v1, s4, s3 -; GFX1132-NEXT: s_and_not1_b32 s2, s2, s6 +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s5, v1, s1 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s1 +; GFX1132-NEXT: v_writelane_b32 v0, s4, s1 +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s6 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: s_add_i32 s4, s4, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5 ; GFX1132-NEXT: s_cbranch_execz .LBB8_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132-NEXT: v_mov_b32_e32 v1, s4 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s10, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mov_b32 s8, s2 ; GFX1132-NEXT: s_mov_b32 s9, s3 -; GFX1132-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc +; GFX1132-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB8_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_nop 0 @@ -2794,48 +2851,50 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1264-LABEL: sub_i32_varying: ; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: s_mov_b64 s[2:3], exec +; GFX1264-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1264-NEXT: s_mov_b64 s[0:1], exec ; GFX1264-NEXT: s_mov_b32 s6, 0 -; GFX1264-NEXT: ; implicit-def: $vgpr1 +; GFX1264-NEXT: ; implicit-def: $vgpr0 ; GFX1264-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-NEXT: s_ctz_i32_b64 s7, s[2:3] -; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1264-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1264-NEXT: s_ctz_i32_b64 s7, s[0:1] +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264-NEXT: 
v_readlane_b32 s8, v1, s7 ; GFX1264-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX1264-NEXT: v_writelane_b32 v1, s6, s7 -; GFX1264-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] +; GFX1264-NEXT: v_writelane_b32 v0, s6, s7 +; GFX1264-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1264-NEXT: s_add_co_i32 s6, s6, s8 -; GFX1264-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1264-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1264-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1264-NEXT: ; implicit-def: $vgpr0 +; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1264-NEXT: ; implicit-def: $vgpr1 ; GFX1264-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1264-NEXT: s_cbranch_execz .LBB8_4 ; GFX1264-NEXT: ; %bb.3: -; GFX1264-NEXT: v_mov_b32_e32 v0, s6 +; GFX1264-NEXT: v_mov_b32_e32 v1, s6 ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s10, -1 ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s8, s2 ; GFX1264-NEXT: s_mov_b32 s9, s3 -; GFX1264-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB8_4: ; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1264-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1264-NEXT: s_mov_b32 s3, 
0x31016000 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1264-NEXT: s_mov_b32 s2, -1 ; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1264-NEXT: s_nop 0 @@ -2844,46 +2903,48 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1232-LABEL: sub_i32_varying: ; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: s_mov_b32 s2, exec_lo +; GFX1232-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1232-NEXT: s_mov_b32 s0, exec_lo ; GFX1232-NEXT: s_mov_b32 s4, 0 -; GFX1232-NEXT: ; implicit-def: $vgpr1 +; GFX1232-NEXT: ; implicit-def: $vgpr0 ; GFX1232-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1232-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1232-NEXT: v_readlane_b32 s5, v0, s3 -; GFX1232-NEXT: s_lshl_b32 s6, 1, s3 -; GFX1232-NEXT: v_writelane_b32 v1, s4, s3 -; GFX1232-NEXT: s_and_not1_b32 s2, s2, s6 +; GFX1232-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232-NEXT: v_readlane_b32 s5, v1, s1 +; GFX1232-NEXT: s_lshl_b32 s6, 1, s1 +; GFX1232-NEXT: v_writelane_b32 v0, s4, s1 +; GFX1232-NEXT: s_and_not1_b32 s0, s0, s6 +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232-NEXT: s_add_co_i32 s4, s4, s5 -; GFX1232-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1232-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1232-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1232-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1232-NEXT: ; implicit-def: $vgpr0 +; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, 
v1 +; GFX1232-NEXT: ; implicit-def: $vgpr1 ; GFX1232-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX1232-NEXT: s_xor_b32 s5, exec_lo, s5 ; GFX1232-NEXT: s_cbranch_execz .LBB8_4 ; GFX1232-NEXT: ; %bb.3: -; GFX1232-NEXT: v_mov_b32_e32 v0, s4 +; GFX1232-NEXT: v_mov_b32_e32 v1, s4 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_mov_b32 s10, -1 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 ; GFX1232-NEXT: s_mov_b32 s9, s3 -; GFX1232-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB8_4: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1232-NEXT: s_mov_b32 s2, -1 ; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1232-NEXT: s_nop 0 @@ -2900,7 +2961,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-LABEL: sub_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -2938,7 +2999,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: sub_i64_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: 
v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -2976,7 +3037,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: sub_i64_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -3014,7 +3075,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1064-LABEL: sub_i64_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -3053,7 +3114,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1032-LABEL: sub_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 @@ -3091,7 +3152,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1164-LABEL: sub_i64_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -3133,7 +3194,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1132-LABEL: sub_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, 
s5, 0 @@ -3173,7 +3234,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1264-LABEL: sub_i64_constant: ; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec ; GFX1264-NEXT: s_mov_b32 s9, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -3215,7 +3276,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1232-LABEL: sub_i64_constant: ; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232-NEXT: s_mov_b32 s4, exec_lo ; GFX1232-NEXT: s_mov_b32 s5, 0 ; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0 @@ -3262,8 +3323,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS-LABEL: sub_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -3306,8 +3367,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: sub_i64_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: s_mov_b64 s[8:9], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 @@ -3347,24 +3408,24 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i64_uniform: ; GFX9: ; %bb.0: ; %entry -; 
GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: s_mov_b64 s[8:9], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9] ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mul_i32 s7, s3, s6 -; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 +; GFX9-NEXT: s_mul_i32 s7, s1, s6 +; GFX9-NEXT: s_mul_hi_u32 s8, s0, s6 ; GFX9-NEXT: s_add_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_i32 s6, s2, s6 +; GFX9-NEXT: s_mul_i32 s6, s0, s6 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 @@ -3373,12 +3434,12 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB10_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[2:3], s0, v2, 0 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v2, v[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s1, v2, v[4:5] ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: v_readfirstlane_b32 s1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 @@ -3391,22 +3452,22 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-LABEL: sub_i64_uniform: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX1064-NEXT: s_mov_b64 s[8:9], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB10_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s8, s[8:9] ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s9, s3, s8 -; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s8 -; GFX1064-NEXT: s_mul_i32 s8, s2, s8 +; GFX1064-NEXT: s_mul_i32 s9, s1, s8 +; GFX1064-NEXT: s_mul_hi_u32 s10, s0, s8 +; GFX1064-NEXT: s_mul_i32 s8, s0, s8 ; GFX1064-NEXT: s_add_i32 s10, s10, s9 ; GFX1064-NEXT: v_mov_b32_e32 v0, s8 ; GFX1064-NEXT: v_mov_b32_e32 v1, s10 @@ -3419,12 +3480,12 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB10_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0 +; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[2:3], s0, v2, 0 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v2, v[4:5] +; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s1, v2, v[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v3 @@ -3436,23 +3497,23 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-LABEL: sub_i64_uniform: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 -; 
GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX1032-NEXT: s_mov_b32 s8, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB10_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s8 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s8 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s8, s3, s1 -; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s1 -; GFX1032-NEXT: s_mul_i32 s1, s2, s1 +; GFX1032-NEXT: s_mul_i32 s8, s1, s3 +; GFX1032-NEXT: s_mul_hi_u32 s9, s0, s3 +; GFX1032-NEXT: s_mul_i32 s3, s0, s3 ; GFX1032-NEXT: s_add_i32 s9, s9, s8 -; GFX1032-NEXT: v_mov_b32_e32 v0, s1 +; GFX1032-NEXT: v_mov_b32_e32 v0, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, s9 ; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: s_mov_b32 s8, s6 @@ -3463,14 +3524,14 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB10_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s0, s2, v2, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s0, s0, v2, 0 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s0, s3, v2, v[4:5] +; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s0, s1, v2, v[4:5] ; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3 ; GFX1032-NEXT: v_mov_b32_e32 v1, v4 
; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo @@ -3480,8 +3541,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-LABEL: sub_i64_uniform: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1164-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX1164-NEXT: s_mov_b64 s[8:9], exec ; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 @@ -3529,17 +3590,17 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-LABEL: sub_i64_uniform: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1132-NEXT: s_mov_b32 s8, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB10_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s8 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s8, s1, s3 @@ -3577,8 +3638,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-LABEL: sub_i64_uniform: ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_clause 0x1 -; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1264-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1264-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1264-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX1264-NEXT: s_mov_b64 
s[8:9], exec ; GFX1264-NEXT: s_mov_b32 s11, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 @@ -3623,18 +3684,18 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-LABEL: sub_i64_uniform: ; GFX1232: ; %bb.0: ; %entry ; GFX1232-NEXT: s_clause 0x1 -; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1232-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX1232-NEXT: s_mov_b32 s2, exec_lo +; GFX1232-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1232-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1232-NEXT: s_mov_b32 s9, exec_lo ; GFX1232-NEXT: s_mov_b32 s3, 0 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0 ; GFX1232-NEXT: s_mov_b32 s8, exec_lo ; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB10_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s9 ; GFX1232-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mul_u64 s[2:3], s[0:1], s[2:3] @@ -3672,7 +3733,7 @@ entry: define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) { ; GFX7LESS-LABEL: sub_i64_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -3691,7 +3752,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX89-LABEL: sub_i64_varying: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -3710,7 +3771,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; 
; GFX10-LABEL: sub_i64_varying: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 @@ -3728,48 +3789,93 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: sub_i64_varying: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: sub_i64_varying: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_mov_b32 s7, 0x31016000 -; GFX12-NEXT: s_mov_b32 s6, -1 -; GFX12-NEXT: s_mov_b32 s11, s7 -; GFX12-NEXT: s_mov_b32 s10, s6 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s8, s2 -; GFX12-NEXT: s_mov_b32 s9, s3 -; GFX12-NEXT: s_mov_b32 s4, s0 -; GFX12-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_mov_b32 s5, s1 -; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm +; GFX1164-LABEL: 
sub_i64_varying: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s6, -1 +; GFX1164-NEXT: s_mov_b32 s11, s7 +; GFX1164-NEXT: s_mov_b32 s10, s6 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_mov_b32 s8, s2 +; GFX1164-NEXT: s_mov_b32 s9, s3 +; GFX1164-NEXT: s_mov_b32 s4, s0 +; GFX1164-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: buffer_gl1_inv +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: s_mov_b32 s5, s1 +; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: sub_i64_varying: +; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s6, -1 +; GFX1132-NEXT: s_mov_b32 s11, s7 +; GFX1132-NEXT: s_mov_b32 s10, s6 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_mov_b32 s8, s2 +; GFX1132-NEXT: s_mov_b32 s9, s3 +; GFX1132-NEXT: s_mov_b32 s4, s0 +; GFX1132-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: buffer_gl1_inv +; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: s_mov_b32 s5, s1 +; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-NEXT: s_endpgm +; +; GFX1264-LABEL: sub_i64_varying: +; GFX1264: ; %bb.0: ; %entry +; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264-NEXT: v_mov_b32_e32 v1, 0 +; GFX1264-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1264-NEXT: s_mov_b32 s6, -1 +; GFX1264-NEXT: s_mov_b32 s11, s7 +; GFX1264-NEXT: 
s_mov_b32 s10, s6 +; GFX1264-NEXT: s_wait_kmcnt 0x0 +; GFX1264-NEXT: s_mov_b32 s8, s2 +; GFX1264-NEXT: s_mov_b32 s9, s3 +; GFX1264-NEXT: s_mov_b32 s4, s0 +; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: s_wait_loadcnt 0x0 +; GFX1264-NEXT: global_inv scope:SCOPE_DEV +; GFX1264-NEXT: s_mov_b32 s5, s1 +; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX1264-NEXT: s_nop 0 +; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1264-NEXT: s_endpgm +; +; GFX1232-LABEL: sub_i64_varying: +; GFX1232: ; %bb.0: ; %entry +; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1232-NEXT: s_mov_b32 s6, -1 +; GFX1232-NEXT: s_mov_b32 s11, s7 +; GFX1232-NEXT: s_mov_b32 s10, s6 +; GFX1232-NEXT: s_wait_kmcnt 0x0 +; GFX1232-NEXT: s_mov_b32 s8, s2 +; GFX1232-NEXT: s_mov_b32 s9, s3 +; GFX1232-NEXT: s_mov_b32 s4, s0 +; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232-NEXT: s_wait_loadcnt 0x0 +; GFX1232-NEXT: global_inv scope:SCOPE_DEV +; GFX1232-NEXT: s_mov_b32 s5, s1 +; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX1232-NEXT: s_nop 0 +; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1232-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %zext = zext i32 %lane to i64 @@ -3777,3 +3883,6 @@ entry: store i64 %old, ptr addrspace(1) %out ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX11: {{.*}} +; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index b0b40aa952a9f..3784af443c7f1 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -24,7 +24,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB0_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -35,8 +35,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB0_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -52,7 +52,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB0_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -63,8 +63,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB0_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -80,7 +80,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -90,8 +90,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB0_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -107,7 +107,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB0_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -119,8 +119,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB0_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; 
GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -131,24 +132,25 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: add_i32_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s1, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB0_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s3, s3, 5 -; GFX1032-NEXT: v_mov_b32_e32 v2, s3 +; GFX1032-NEXT: s_mul_i32 s1, s1, 5 +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB0_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -160,7 +162,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1164-LABEL: add_i32_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -177,8 +179,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB0_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -192,24 +194,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: add_i32_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_mul_i32 s3, s3, 5 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s3 +; GFX1132-NEXT: s_mul_i32 s1, s1, 5 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s1 ; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB0_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -232,12 +234,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX7LESS-LABEL: add_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 
s[4:5], exec -; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb +; GFX7LESS-NEXT: s_load_dword s6, s[2:3], 0xb ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -249,8 +251,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB1_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -262,13 +264,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX8-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -280,8 +282,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; 
GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 @@ -293,13 +295,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -310,8 +312,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 @@ -323,13 +325,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX1064-LABEL: add_i32_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX1064-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB1_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -342,8 +344,9 @@ define 
amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB1_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3] @@ -354,39 +357,40 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX1032-LABEL: add_i32_uniform: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX1032-NEXT: s_mov_b32 s4, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB1_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s4, s2, s4 +; GFX1032-NEXT: s_mul_i32 s4, s0, s4 ; GFX1032-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB1_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s6, -1 ; GFX1032-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5] -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3] +; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i32_uniform: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b32 s6, s[0:1], 0x2c +; GFX1164-NEXT: s_load_b32 s6, s[2:3], 0x2c ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -404,8 +408,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB1_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -419,9 +423,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX1132-LABEL: add_i32_uniform: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x2c ; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s1, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -430,22 +434,22 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mul_i32 s4, s2, s4 +; 
GFX1132-NEXT: s_mul_i32 s4, s0, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4 ; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB1_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s6, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5] -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: buffer_store_b32 v1, off, s[0:3], 0 +; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3] +; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0 ; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm @@ -460,7 +464,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: add_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -473,27 +477,27 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: 
v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB2_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -502,8 +506,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB2_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -514,27 +518,27 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: 
s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB2_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -542,8 +546,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB2_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -554,26 +558,26 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: add_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: s_mov_b32 s4, 0 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064-NEXT: s_add_i32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1064-NEXT: ; %bb.2: ; 
%ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB2_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -583,8 +587,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB2_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v1 @@ -595,36 +600,37 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: add_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s0, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 ; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 -; GFX1032-NEXT: s_add_i32 s2, s2, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032-NEXT: s_add_i32 s0, s0, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1032-NEXT: ; %bb.2: ; 
%ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1032-NEXT: s_cbranch_execz .LBB2_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: v_mov_b32_e32 v2, s0 ; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB2_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v1 @@ -635,43 +641,45 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: add_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: ; implicit-def: $vgpr0 ; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], 
s[6:7] +; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_add_i32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB2_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_add_rtn_u32 v0, v0, v2 +; GFX1164-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB2_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -681,41 
+689,42 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: add_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 ; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX1132-NEXT: s_add_i32 s2, s2, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: s_add_i32 s0, s0, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1132-NEXT: s_cbranch_execz .LBB2_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 
v2, s2 -; GFX1132-NEXT: ds_add_rtn_u32 v0, v0, v2 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB2_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -854,16 +863,17 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; ; GFX1164-LABEL: add_i32_varying_nouse: ; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_mov_b32 s2, 0 ; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: s_add_i32 s2, s2, s6 ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 @@ -886,16 +896,17 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; ; GFX1132-LABEL: add_i32_varying_nouse: ; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132-NEXT: s_mov_b32 
s1, exec_lo ; GFX1132-NEXT: s_mov_b32 s0, 0 ; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1132-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s1, s1, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: s_add_i32 s0, s0, s3 ; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 @@ -929,7 +940,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -940,8 +951,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB4_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -962,7 +973,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: 
s_cbranch_execz .LBB4_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -973,10 +984,10 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB4_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_readfirstlane_b32 s3, v0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, s3 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] @@ -994,7 +1005,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1004,10 +1015,10 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB4_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_readfirstlane_b32 s3, v0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] @@ -1025,7 +1036,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: 
s_cbranch_execz .LBB4_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1037,8 +1048,9 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB4_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] @@ -1050,24 +1062,25 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: add_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s1, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s3, s3, 5 -; GFX1032-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032-NEXT: s_mul_i32 s1, s1, 5 +; GFX1032-NEXT: v_mov_b32_e32 v0, s1 ; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB4_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 
s2, v0 ; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] @@ -1080,7 +1093,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1164-LABEL: add_i64_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 @@ -1097,8 +1110,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB4_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1113,25 +1126,25 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: add_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_mul_i32 s3, s3, 5 +; GFX1132-NEXT: s_mul_i32 s1, s1, 5 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_e32 v0, s3 +; GFX1132-NEXT: v_mov_b32_e32 v0, s1 ; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; 
GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB4_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1155,7 +1168,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX7LESS-LABEL: add_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -1196,7 +1209,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX8-LABEL: add_i64_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -1234,7 +1247,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX9-LABEL: add_i64_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -1272,7 +1285,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX1064-LABEL: add_i64_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 
0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -1308,7 +1321,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX1032-LABEL: add_i64_uniform: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 @@ -1343,7 +1356,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX1164-LABEL: add_i64_uniform: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -1384,7 +1397,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX1132-LABEL: add_i64_uniform: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 @@ -1432,7 +1445,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: add_i64_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -1447,7 +1460,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ 
-1459,7 +1472,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9-LABEL: add_i64_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1471,7 +1484,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX10-LABEL: add_i64_varying: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1481,20 +1494,36 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: add_i64_varying: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX1164-LABEL: add_i64_varying: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 +; 
GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: add_i64_varying: +; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %zext = zext i32 %lane to i64 @@ -1513,7 +1542,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1524,8 +1553,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB7_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -1542,7 +1571,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; 
GFX8-NEXT: s_cbranch_execz .LBB7_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1553,8 +1582,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB7_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -1571,7 +1600,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1581,8 +1610,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB7_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -1599,7 +1628,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB7_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1611,8 +1640,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; 
GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB7_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -1624,24 +1654,25 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: sub_i32_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s1, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB7_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s3, s3, 5 -; GFX1032-NEXT: v_mov_b32_e32 v2, s3 +; GFX1032-NEXT: s_mul_i32 s1, s1, 5 +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB7_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -1654,7 +1685,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1164-LABEL: sub_i32_constant: ; GFX1164: ; %bb.0: ; %entry ; 
GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1671,8 +1702,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB7_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -1687,24 +1718,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: sub_i32_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB7_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_mul_i32 s3, s3, 5 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s3 +; GFX1132-NEXT: s_mul_i32 s1, s1, 5 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s1 ; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB7_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; 
GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -1728,12 +1759,12 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX7LESS-LABEL: sub_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb +; GFX7LESS-NEXT: s_load_dword s6, s[2:3], 0xb ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB8_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1745,8 +1776,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB8_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -1758,13 +1789,13 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX8-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: 
s_cbranch_execz .LBB8_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1776,8 +1807,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB8_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 @@ -1789,13 +1820,13 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1806,8 +1837,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB8_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 @@ -1819,13 +1850,13 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX1064-LABEL: sub_i32_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX1064-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX1064-NEXT: 
s_mov_b64 s[4:5], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1838,8 +1869,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB8_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 @@ -1851,40 +1882,40 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX1032-LABEL: sub_i32_uniform: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX1032-NEXT: s_mov_b32 s4, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s4, s2, s4 +; GFX1032-NEXT: s_mul_i32 s4, s0, s4 ; GFX1032-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB8_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: sub_i32_uniform: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b32 s6, s[0:1], 0x2c +; GFX1164-NEXT: s_load_b32 s6, s[2:3], 0x2c ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1902,8 +1933,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB8_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 @@ -1918,9 +1949,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX1132-LABEL: sub_i32_uniform: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x2c ; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: 
s_mov_b32 s1, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1929,23 +1960,23 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mul_i32 s4, s2, s4 +; GFX1132-NEXT: s_mul_i32 s4, s0, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4 ; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB8_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s6, -1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm @@ -1960,7 +1991,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: sub_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -1973,27 +2004,27 @@ define amdgpu_kernel 
void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: sub_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB9_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB9_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -2002,8 +2033,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB9_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -2014,27 +2045,27 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX9-LABEL: sub_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 
s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB9_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB9_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -2042,8 +2073,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB9_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -2054,26 +2085,26 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: sub_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: s_mov_b32 s4, 0 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] +; 
GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064-NEXT: s_add_i32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB9_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -2083,8 +2114,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB9_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v1 @@ -2095,36 +2127,37 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: sub_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s0, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 ; 
GFX1032-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 -; GFX1032-NEXT: s_add_i32 s2, s2, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032-NEXT: s_add_i32 s0, s0, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1032-NEXT: s_cbranch_execz .LBB9_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: v_mov_b32_e32 v2, s0 ; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB9_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v1 @@ -2135,43 +2168,45 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: sub_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: ; implicit-def: $vgpr0 ; GFX1164-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_add_i32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB9_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_sub_rtn_u32 v0, v0, v2 +; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB9_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: 
v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -2181,41 +2216,42 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: sub_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 ; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX1132-NEXT: s_add_i32 s2, s2, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: s_add_i32 s0, s0, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; 
GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1132-NEXT: s_cbranch_execz .LBB9_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s2 -; GFX1132-NEXT: ds_sub_rtn_u32 v0, v0, v2 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB9_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -2354,16 +2390,17 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; ; GFX1164-LABEL: sub_i32_varying_nouse: ; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_mov_b32 s2, 0 ; GFX1164-NEXT: .LBB10_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX1164-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: s_add_i32 s2, s2, s6 ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 @@ -2386,16 +2423,17 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; ; GFX1132-LABEL: sub_i32_varying_nouse: ; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132-NEXT: s_mov_b32 s1, exec_lo ; GFX1132-NEXT: s_mov_b32 s0, 0 ; GFX1132-NEXT: .LBB10_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1132-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s1, s1, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: s_add_i32 s0, s0, s3 ; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 @@ -2429,7 +2467,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB11_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -2440,8 +2478,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB11_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, 
s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -2462,7 +2500,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB11_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -2473,8 +2511,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB11_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: v_readfirstlane_b32 s5, v0 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -2495,7 +2533,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB11_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -2505,8 +2543,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB11_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ 
-2527,7 +2565,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB11_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -2539,8 +2577,9 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB11_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 @@ -2555,24 +2594,25 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: sub_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s1, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB11_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s3, s3, 5 -; GFX1032-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032-NEXT: s_mul_i32 s1, s1, 5 +; GFX1032-NEXT: v_mov_b32_e32 v0, s1 ; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB11_2: ; GFX1032-NEXT: 
s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 @@ -2588,7 +2628,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1164-LABEL: sub_i64_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 @@ -2605,8 +2645,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB11_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 @@ -2624,25 +2664,25 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: sub_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB11_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; 
GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_mul_i32 s3, s3, 5 +; GFX1132-NEXT: s_mul_i32 s1, s1, 5 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_e32 v0, s3 +; GFX1132-NEXT: v_mov_b32_e32 v0, s1 ; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB11_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 @@ -2669,7 +2709,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX7LESS-LABEL: sub_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -2710,7 +2750,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX8-LABEL: sub_i64_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -2749,7 +2789,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX9-LABEL: sub_i64_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -2789,7 +2829,7 @@ define 
amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX1064-LABEL: sub_i64_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -2828,7 +2868,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX1032-LABEL: sub_i64_uniform: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 @@ -2866,7 +2906,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX1164-LABEL: sub_i64_uniform: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -2909,7 +2949,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX1132-LABEL: sub_i64_uniform: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 @@ -2959,7 +2999,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: sub_i64_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -2974,7 +3014,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr 
addrspace(1) %out) { ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2986,7 +3026,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX9-LABEL: sub_i64_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2998,7 +3038,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX10-LABEL: sub_i64_varying: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3008,20 +3048,36 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: sub_i64_varying: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX1164-LABEL: sub_i64_varying: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 
0x24 +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: sub_i64_varying: +; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %zext = zext i32 %lane to i64 @@ -3035,7 +3091,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: and_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -3048,27 +3104,27 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: and_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s4, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB14_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 
1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_and_b32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB14_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -3077,8 +3133,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB14_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -3089,27 +3145,27 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX9-LABEL: and_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s4, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB14_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_and_b32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_andn2_b64 
s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -3117,8 +3173,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB14_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -3129,26 +3185,26 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: and_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: s_mov_b32 s4, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064-NEXT: s_and_b32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 
v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB14_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -3158,8 +3214,9 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB14_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_and_b32_e32 v0, s2, v1 @@ -3170,36 +3227,37 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: and_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s0, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 ; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 -; GFX1032-NEXT: s_and_b32 s2, s2, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032-NEXT: s_and_b32 s0, s0, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1032-NEXT: s_cbranch_execz .LBB14_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: v_mov_b32_e32 v2, s0 ; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB14_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_and_b32_e32 v0, s2, v1 @@ -3210,43 +3268,45 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: and_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_mov_b32 s4, -1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: ; implicit-def: $vgpr0 ; GFX1164-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: 
v_writelane_b32 v0, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_and_b32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB14_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_and_rtn_b32 v0, v0, v2 +; GFX1164-NEXT: ds_and_rtn_b32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB14_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_and_b32_e32 v0, s2, v1 +; GFX1164-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -3256,41 +3316,42 @@ define 
amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: and_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, -1 +; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 ; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX1132-NEXT: s_and_b32 s2, s2, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: s_and_b32 s0, s0, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1132-NEXT: s_cbranch_execz .LBB14_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s2 -; 
GFX1132-NEXT: ds_and_rtn_b32 v0, v0, v2 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132-NEXT: ds_and_rtn_b32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB14_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_and_b32_e32 v0, s2, v1 +; GFX1132-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -3309,7 +3370,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: or_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -3322,27 +3383,27 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: or_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB15_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_or_b32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; 
GFX8-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB15_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -3351,8 +3412,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB15_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -3363,27 +3424,27 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX9-LABEL: or_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB15_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_or_b32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, 
v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB15_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -3391,8 +3452,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB15_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -3403,26 +3464,26 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: or_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: s_mov_b32 s4, 0 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064-NEXT: s_or_b32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: 
s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB15_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -3432,8 +3493,9 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB15_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_or_b32_e32 v0, s2, v1 @@ -3444,36 +3506,37 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: or_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s0, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 ; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 -; GFX1032-NEXT: s_or_b32 s2, s2, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032-NEXT: s_or_b32 s0, s0, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: 
s_xor_b32 s1, exec_lo, s1 ; GFX1032-NEXT: s_cbranch_execz .LBB15_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: v_mov_b32_e32 v2, s0 ; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB15_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_or_b32_e32 v0, s2, v1 @@ -3484,43 +3547,45 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: or_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: ; implicit-def: $vgpr0 ; GFX1164-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_or_b32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 
.LBB15_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB15_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_or_rtn_b32 v0, v0, v2 +; GFX1164-NEXT: ds_or_rtn_b32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB15_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_or_b32_e32 v0, s2, v1 +; GFX1164-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -3530,41 +3595,42 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: or_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; 
GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 ; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX1132-NEXT: s_or_b32 s2, s2, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: s_or_b32 s0, s0, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1132-NEXT: s_cbranch_execz .LBB15_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s2 -; GFX1132-NEXT: ds_or_rtn_b32 v0, v0, v2 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132-NEXT: ds_or_rtn_b32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB15_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; 
GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_or_b32_e32 v0, s2, v1 +; GFX1132-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -3583,7 +3649,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: xor_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -3596,27 +3662,27 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: xor_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB16_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_xor_b32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; 
GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB16_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -3625,8 +3691,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB16_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -3637,27 +3703,27 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX9-LABEL: xor_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB16_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_xor_b32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB16_4 ; GFX9-NEXT: ; %bb.3: ; 
GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -3665,8 +3731,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB16_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -3677,26 +3743,26 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: xor_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: s_mov_b32 s4, 0 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064-NEXT: s_xor_b32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB16_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -3706,8 +3772,9 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: 
buffer_gl0_inv ; GFX1064-NEXT: .LBB16_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_xor_b32_e32 v0, s2, v1 @@ -3718,36 +3785,37 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: xor_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s0, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 ; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 -; GFX1032-NEXT: s_xor_b32 s2, s2, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032-NEXT: s_xor_b32 s0, s0, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1032-NEXT: s_cbranch_execz .LBB16_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: v_mov_b32_e32 v2, s0 ; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB16_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_xor_b32_e32 v0, s2, v1 @@ -3758,43 +3826,45 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: xor_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: ; implicit-def: $vgpr0 ; GFX1164-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_xor_b32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, 
v0 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB16_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_xor_rtn_b32 v0, v0, v2 +; GFX1164-NEXT: ds_xor_rtn_b32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB16_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_xor_b32_e32 v0, s2, v1 +; GFX1164-NEXT: v_xor_b32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -3804,41 +3874,42 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: xor_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 ; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX1132-NEXT: s_xor_b32 s2, s2, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: s_xor_b32 s0, s0, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1132-NEXT: s_cbranch_execz .LBB16_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s2 -; GFX1132-NEXT: ds_xor_rtn_b32 v0, v0, v2 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132-NEXT: ds_xor_rtn_b32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB16_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 
0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_xor_b32_e32 v0, s2, v1 +; GFX1132-NEXT: v_xor_b32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -3857,7 +3928,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: max_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -3870,27 +3941,27 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: max_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_brev_b32 s4, 1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB17_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_max_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB17_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -3899,8 +3970,8 @@ define 
amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB17_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -3911,27 +3982,27 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX9-LABEL: max_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_brev_b32 s4, 1 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB17_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_max_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB17_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -3939,8 +4010,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB17_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -3951,26 +4022,26 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: max_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: s_brev_b32 s4, 1 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064-NEXT: s_max_i32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB17_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -3980,8 +4051,9 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB17_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_max_i32_e32 v0, s2, v1 @@ -3992,36 +4064,37 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: max_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_brev_b32 s2, 1 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_brev_b32 s0, 1 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 ; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 -; GFX1032-NEXT: s_max_i32 s2, s2, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032-NEXT: s_max_i32 s0, s0, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1032-NEXT: s_cbranch_execz .LBB17_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: v_mov_b32_e32 v2, s0 ; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB17_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_max_i32_e32 v0, s2, v1 @@ -4032,43 +4105,45 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: max_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_brev_b32 s4, 1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: ; implicit-def: $vgpr0 ; GFX1164-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_max_i32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164-NEXT: ; 
implicit-def: $vgpr1 +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB17_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_max_rtn_i32 v0, v0, v2 +; GFX1164-NEXT: ds_max_rtn_i32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB17_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX1164-NEXT: v_max_i32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -4078,41 +4153,42 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: max_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_brev_b32 s2, 1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_brev_b32 s0, 1 +; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 ; 
GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX1132-NEXT: s_max_i32 s2, s2, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: s_max_i32 s0, s0, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1132-NEXT: s_cbranch_execz .LBB17_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s2 -; GFX1132-NEXT: ds_max_rtn_i32 v0, v0, v2 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132-NEXT: ds_max_rtn_i32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB17_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX1132-NEXT: v_max_i32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ 
-4135,7 +4211,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB18_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -4145,8 +4221,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB18_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 @@ -4169,7 +4245,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB18_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 @@ -4179,10 +4255,10 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB18_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc @@ -4203,7 +4279,7 @@ define amdgpu_kernel void 
@max_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB18_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 @@ -4212,10 +4288,10 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB18_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc @@ -4236,7 +4312,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB18_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 @@ -4247,8 +4323,9 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB18_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc @@ -4267,7 +4344,7 @@ define amdgpu_kernel void 
@max_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB18_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 @@ -4278,8 +4355,9 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB18_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo @@ -4300,7 +4378,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164-NEXT: s_cbranch_execz .LBB18_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 @@ -4310,8 +4388,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB18_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc @@ -4334,7 +4412,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; 
GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB18_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 @@ -4343,8 +4421,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB18_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo @@ -4371,7 +4449,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: min_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -4384,27 +4462,27 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: min_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB19_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_min_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], 
s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB19_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -4413,8 +4491,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB19_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -4425,27 +4503,27 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX9-LABEL: min_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_brev_b32 s4, -2 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB19_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_min_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 
v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB19_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -4453,8 +4531,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB19_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -4465,26 +4543,26 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: min_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: s_brev_b32 s4, -2 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064-NEXT: s_min_i32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; 
GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB19_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -4494,8 +4572,9 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB19_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_min_i32_e32 v0, s2, v1 @@ -4506,36 +4585,37 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: min_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_brev_b32 s2, -2 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_brev_b32 s0, -2 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 ; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 -; GFX1032-NEXT: s_min_i32 s2, s2, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032-NEXT: s_min_i32 s0, s0, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 
+; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1032-NEXT: s_cbranch_execz .LBB19_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: v_mov_b32_e32 v2, s0 ; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB19_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_min_i32_e32 v0, s2, v1 @@ -4546,43 +4626,45 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: min_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_brev_b32 s4, -2 -; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: ; implicit-def: $vgpr0 ; GFX1164-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_min_i32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; 
GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB19_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_min_rtn_i32 v0, v0, v2 +; GFX1164-NEXT: ds_min_rtn_i32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB19_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_i32_e32 v0, s2, v1 +; GFX1164-NEXT: v_min_i32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -4592,41 +4674,42 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: min_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_brev_b32 s2, -2 -; GFX1132-NEXT: ; 
implicit-def: $vgpr1 +; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_brev_b32 s0, -2 +; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 ; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX1132-NEXT: s_min_i32 s2, s2, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: s_min_i32 s0, s0, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1132-NEXT: s_cbranch_execz .LBB19_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s2 -; GFX1132-NEXT: ds_min_rtn_i32 v0, v0, v2 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132-NEXT: ds_min_rtn_i32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; 
GFX1132-NEXT: .LBB19_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_i32_e32 v0, s2, v1 +; GFX1132-NEXT: v_min_i32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -4649,7 +4732,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB20_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -4659,8 +4742,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB20_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 @@ -4683,7 +4766,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB20_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 @@ -4693,10 +4776,10 @@ define 
amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB20_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc @@ -4717,7 +4800,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB20_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 @@ -4726,10 +4809,10 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB20_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc @@ -4750,7 +4833,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB20_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 @@ -4761,8 +4844,9 
@@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB20_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc @@ -4781,7 +4865,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB20_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 @@ -4792,8 +4876,9 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB20_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo @@ -4814,7 +4899,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164-NEXT: s_cbranch_execz .LBB20_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 @@ -4824,8 +4909,8 @@ define 
amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB20_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc @@ -4848,7 +4933,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB20_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 @@ -4857,8 +4942,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB20_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo @@ -4885,7 +4970,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: umax_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -4898,27 +4983,27 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: umax_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[2:3], 
exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB21_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_max_u32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB21_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -4927,8 +5012,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB21_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -4939,27 +5024,27 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX9-LABEL: umax_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB21_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_max_u32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB21_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -4967,8 +5052,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB21_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -4979,26 +5064,26 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: umax_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: s_mov_b32 s4, 0 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; 
GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064-NEXT: s_max_u32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB21_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -5008,8 +5093,9 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB21_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_max_u32_e32 v0, s2, v1 @@ -5020,36 +5106,37 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: umax_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s0, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 ; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 -; 
GFX1032-NEXT: s_andn2_b32 s3, s3, s6 -; GFX1032-NEXT: s_max_u32 s2, s2, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032-NEXT: s_max_u32 s0, s0, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1032-NEXT: s_cbranch_execz .LBB21_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: v_mov_b32_e32 v2, s0 ; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB21_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_max_u32_e32 v0, s2, v1 @@ -5060,43 +5147,45 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: umax_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: ; implicit-def: $vgpr0 ; GFX1164-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | 
instid1(VALU_DEP_2) -; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_max_u32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB21_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_max_rtn_u32 v0, v0, v2 +; GFX1164-NEXT: ds_max_rtn_u32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB21_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_u32_e32 v0, s2, v1 +; GFX1164-NEXT: v_max_u32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -5106,41 +5195,42 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: umax_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 ; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX1132-NEXT: s_max_u32 s2, s2, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: s_max_u32 s0, s0, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, 
vcc_lo -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1132-NEXT: s_cbranch_execz .LBB21_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s2 -; GFX1132-NEXT: ds_max_rtn_u32 v0, v0, v2 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132-NEXT: ds_max_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB21_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_u32_e32 v0, s2, v1 +; GFX1132-NEXT: v_max_u32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -5163,7 +5253,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB22_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -5173,8 +5263,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB22_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 @@ -5196,7 +5286,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB22_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 @@ -5206,8 +5296,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB22_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -5229,7 +5319,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 @@ -5238,8 +5328,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB22_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -5261,7 +5351,7 @@ define amdgpu_kernel void 
@umax_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB22_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 @@ -5272,8 +5362,9 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB22_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -5292,7 +5383,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB22_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 @@ -5303,8 +5394,9 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB22_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -5325,7 +5417,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; 
GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164-NEXT: s_cbranch_execz .LBB22_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 @@ -5335,8 +5427,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB22_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -5359,7 +5451,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB22_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 @@ -5368,8 +5460,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB22_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 @@ -5396,7 +5488,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: umin_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -5409,27 +5501,27 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: umin_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s4, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB23_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_min_u32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB23_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -5438,8 +5530,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB23_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -5450,27 +5542,27 @@ define amdgpu_kernel void 
@umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX9-LABEL: umin_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s4, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB23_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_min_u32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB23_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -5478,8 +5570,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB23_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -5490,26 +5582,26 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: umin_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; 
GFX1064-NEXT: s_mov_b32 s4, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064-NEXT: s_min_u32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB23_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -5519,8 +5611,9 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB23_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_min_u32_e32 v0, s2, v1 @@ -5531,36 +5624,37 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: umin_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s0, -1 
; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 ; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 -; GFX1032-NEXT: s_min_u32 s2, s2, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032-NEXT: s_min_u32 s0, s0, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1032-NEXT: s_cbranch_execz .LBB23_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: v_mov_b32_e32 v2, s0 ; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB23_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_min_u32_e32 v0, s2, v1 @@ -5571,43 +5665,45 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: umin_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; 
GFX1164-NEXT: s_mov_b32 s4, -1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: ; implicit-def: $vgpr0 ; GFX1164-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_min_u32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB23_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_min_rtn_u32 v0, v0, v2 +; GFX1164-NEXT: ds_min_rtn_u32 v1, v1, v2 ; 
GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB23_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_u32_e32 v0, s2, v1 +; GFX1164-NEXT: v_min_u32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -5617,41 +5713,42 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: umin_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, -1 +; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 ; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX1132-NEXT: s_min_u32 s2, s2, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: s_min_u32 s0, s0, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd 
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1132-NEXT: s_cbranch_execz .LBB23_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s2 -; GFX1132-NEXT: ds_min_rtn_u32 v0, v0, v2 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132-NEXT: ds_min_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB23_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_u32_e32 v0, s2, v1 +; GFX1132-NEXT: v_min_u32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -5674,7 +5771,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB24_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -5684,8 +5781,8 @@ define amdgpu_kernel void 
@umin_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB24_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 @@ -5707,7 +5804,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB24_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 @@ -5717,8 +5814,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB24_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -5740,7 +5837,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 @@ -5749,8 +5846,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB24_2: -; 
GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -5772,7 +5869,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB24_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 @@ -5783,8 +5880,9 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB24_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -5803,7 +5901,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB24_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 @@ -5814,8 +5912,9 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB24_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo @@ -5836,7 +5935,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164-NEXT: s_cbranch_execz .LBB24_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 @@ -5846,8 +5945,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB24_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -5870,7 +5969,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB24_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 @@ -5879,8 +5978,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB24_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo @@ -5901,3 +6000,5 @@ entry: store i64 %old, ptr addrspace(1) %out ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index ca94d68f01917..995d3fee67291 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -23,18 +23,18 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_cbranch_execz .LBB0_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_mul_i32 s4, s4, 5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB0_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -51,18 +51,18 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB0_2 ; 
GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB0_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2 @@ -79,18 +79,18 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB0_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -106,10 +106,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 ; 
GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 @@ -117,9 +117,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB0_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) +; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -129,24 +130,25 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10W32-LABEL: add_i32_constant: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX10W32-NEXT: s_mul_i32 s3, s3, 5 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB0_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: 
s_or_b32 exec_lo, exec_lo, s2 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) +; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -157,7 +159,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-LABEL: add_i32_constant: ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b64 s[0:1], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -165,7 +167,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 @@ -173,8 +175,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB0_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 @@ -188,24 +190,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX11W32-LABEL: 
add_i32_constant: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_mov_b32 s3, exec_lo -; GFX11W32-NEXT: s_mov_b32 s2, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX11W32-NEXT: s_mov_b32 s1, exec_lo +; GFX11W32-NEXT: s_mov_b32 s0, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: s_mul_i32 s3, s3, 5 -; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 +; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB0_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 @@ -220,7 +222,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-LABEL: add_i32_constant: ; GFX12W64: ; %bb.0: ; %entry ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[2:3], exec +; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -228,7 +230,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: 
s_cbranch_execz .LBB0_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 @@ -236,8 +238,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB0_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 @@ -251,24 +253,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12W32-LABEL: add_i32_constant: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_mov_b32 s3, exec_lo -; GFX12W32-NEXT: s_mov_b32 s2, exec_lo -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX12W32-NEXT: s_mov_b32 s1, exec_lo +; GFX12W32-NEXT: s_mov_b32 s0, exec_lo +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX12W32-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: s_mul_i32 s3, s3, 5 -; GFX12W32-NEXT: v_mov_b32_e32 v1, s3 +; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: 
buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB0_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 @@ -289,23 +291,23 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: add_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 +; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_cbranch_execz .LBB1_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mul_i32 s4, s6, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB1_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -318,24 +320,24 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 +; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mul_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -348,24 +350,24 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; 
GFX9-NEXT: s_waitcnt vmcnt(0) @@ -377,16 +379,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W64-LABEL: add_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 +; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 @@ -394,9 +396,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) +; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3] @@ -406,37 +409,37 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W32-LABEL: add_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 +; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 ; GFX10W32-NEXT: s_mov_b32 s4, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; 
GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 +; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5] +; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5] ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 +; GFX11W64-NEXT: s_load_b32 s6, s[2:3], 0x44 ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b64 s[0:1], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -444,7 +447,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 
s4, s[4:5] ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: s_mul_i32 s4, s6, s4 @@ -452,8 +455,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB1_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) @@ -467,41 +470,41 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11W32-LABEL: add_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44 +; GFX11W32-NEXT: s_load_b32 s0, s[2:3], 0x44 ; GFX11W32-NEXT: s_mov_b32 s4, exec_lo -; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s1, exec_lo ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s4, s2, s4 +; GFX11W32-NEXT: s_mul_i32 s4, s0, s4 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB1_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX11W32-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5] +; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5] ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_uniform: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_load_b32 s6, s[0:1], 0x44 +; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44 ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[2:3], exec +; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -509,7 +512,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 @@ -517,8 +520,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB1_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -532,32 +535,32 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W32-LABEL: 
add_i32_uniform: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_load_b32 s2, s[0:1], 0x44 +; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44 ; GFX12W32-NEXT: s_mov_b32 s4, exec_lo -; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s1, exec_lo ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: s_mul_i32 s4, s2, s4 +; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB1_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5] +; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -570,8 +573,8 @@ entry: define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -582,36 +585,36 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB2_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB2_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, 
v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 @@ -623,36 +626,36 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB2_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB2_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -663,37 +666,38 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; 
GFX10W64-LABEL: add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b64 s[0:1], exec ; GFX10W64-NEXT: s_mov_b32 s4, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_add_i32 s4, s4, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX10W64-NEXT: ; %bb.3: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) +; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 
@@ -703,36 +707,37 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W32-LABEL: add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s3, exec_lo -; GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: s_mov_b32 s1, exec_lo +; GFX10W32-NEXT: s_mov_b32 s0, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s1 ; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 -; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 -; GFX10W32-NEXT: s_add_i32 s2, s2, s5 -; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10W32-NEXT: v_writelane_b32 v1, s0, s4 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_add_i32 s0, s0, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX10W32-NEXT: ; %bb.3: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) 
+; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -742,174 +747,182 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX11W64-LABEL: add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11W64-NEXT: s_mov_b64 s[0:1], exec ; GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 ; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; 
GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W64-NEXT: ; %bb.3: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB2_4: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 +; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_mov_b32 s3, exec_lo -; GFX11W32-NEXT: s_mov_b32 s2, 0 -; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11W32-NEXT: s_mov_b32 s1, exec_lo +; GFX11W32-NEXT: s_mov_b32 s0, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 ; GFX11W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 -; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX11W32-NEXT: s_add_i32 s2, s2, s5 -; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: s_add_i32 s0, s0, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W32-NEXT: ; %bb.3: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB2_4: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: 
v_readfirstlane_b32 s2, v0 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 +; GFX11W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_varying_vdata: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_mov_b64 s[2:3], exec +; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: s_mov_b32 s4, 0 -; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: ; implicit-def: $vgpr0 ; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 
-; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX12W64-NEXT: ; implicit-def: $vgpr0 -; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W64-NEXT: ; %bb.3: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB2_4: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 +; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: add_i32_varying_vdata: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_mov_b32 s3, exec_lo -; GFX12W32-NEXT: s_mov_b32 s2, 0 -; GFX12W32-NEXT: ; implicit-def: $vgpr1 +; GFX12W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12W32-NEXT: s_mov_b32 s1, exec_lo +; 
GFX12W32-NEXT: s_mov_b32 s0, 0 +; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_ctz_i32_b32 s4, s3 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 v1, s2, s4 -; GFX12W32-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX12W32-NEXT: s_add_co_i32 s2, s2, s5 -; GFX12W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX12W32-NEXT: ; implicit-def: $vgpr0 -; GFX12W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX12W32-NEXT: ; implicit-def: $vgpr1 +; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W32-NEXT: ; %bb.3: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null 
th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB2_4: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 +; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -923,8 +936,8 @@ entry: define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc @@ -936,9 +949,9 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: add_i32_varying_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 offen glc ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -949,9 +962,9 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: add_i32_varying_offset: ; GFX9: ; %bb.0: ; 
%entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -961,9 +974,10 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: add_i32_varying_offset: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -971,33 +985,67 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: add_i32_varying_offset: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11W64-LABEL: add_i32_varying_offset: +; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: s_clause 0x1 +; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v0, 
s[4:7], 0 offen glc +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W64-NEXT: s_nop 0 +; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W64-NEXT: s_endpgm +; +; GFX11W32-LABEL: add_i32_varying_offset: +; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: s_clause 0x1 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W32-NEXT: s_nop 0 +; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W32-NEXT: s_endpgm ; -; GFX12-LABEL: add_i32_varying_offset: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX12-NEXT: v_mov_b32_e32 v1, 1 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm +; GFX12W64-LABEL: add_i32_varying_offset: +; GFX12W64: ; %bb.0: ; %entry +; GFX12W64-NEXT: s_clause 0x1 +; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 +; GFX12W64-NEXT: s_wait_kmcnt 0x0 +; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX12W64-NEXT: s_wait_loadcnt 0x0 +; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12W64-NEXT: s_nop 0 +; GFX12W64-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX12W64-NEXT: s_endpgm +; +; GFX12W32-LABEL: add_i32_varying_offset: +; GFX12W32: ; %bb.0: ; %entry +; GFX12W32-NEXT: s_clause 0x1 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12W32-NEXT: s_wait_kmcnt 0x0 +; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX12W32-NEXT: s_wait_loadcnt 0x0 +; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12W32-NEXT: s_nop 0 +; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0) @@ -1013,18 +1061,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_cbranch_execz .LBB4_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_mul_i32 s4, s4, 5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB4_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1042,18 +1090,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB4_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB4_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1071,18 +1119,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB4_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1099,10 +1147,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: 
v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB4_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 @@ -1110,9 +1158,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB4_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) +; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 @@ -1123,24 +1172,25 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10W32-LABEL: sub_i32_constant: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB4_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX10W32-NEXT: s_mul_i32 s3, s3, 5 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 
+; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB4_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) +; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 @@ -1152,7 +1202,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-LABEL: sub_i32_constant: ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b64 s[0:1], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1160,7 +1210,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB4_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 @@ -1168,8 +1218,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB4_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: 
s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1184,24 +1234,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX11W32-LABEL: sub_i32_constant: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_mov_b32 s3, exec_lo -; GFX11W32-NEXT: s_mov_b32 s2, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX11W32-NEXT: s_mov_b32 s1, exec_lo +; GFX11W32-NEXT: s_mov_b32 s0, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB4_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: s_mul_i32 s3, s3, 5 -; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 +; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB4_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1217,7 +1267,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-LABEL: sub_i32_constant: ; GFX12W64: ; %bb.0: ; %entry ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[2:3], exec +; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; 
GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1225,7 +1275,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB4_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 @@ -1233,8 +1283,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB4_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1249,24 +1299,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12W32-LABEL: sub_i32_constant: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_mov_b32 s3, exec_lo -; GFX12W32-NEXT: s_mov_b32 s2, exec_lo -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX12W32-NEXT: s_mov_b32 s1, exec_lo +; GFX12W32-NEXT: s_mov_b32 s0, exec_lo +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB4_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX12W32-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: s_mul_i32 s3, s3, 5 -; GFX12W32-NEXT: v_mov_b32_e32 v1, s3 +; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB4_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1288,23 +1338,23 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: sub_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 +; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_cbranch_execz .LBB5_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mul_i32 s4, s6, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB5_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1317,24 +1367,24 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: sub_i32_uniform: ; 
GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 +; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB5_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mul_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB5_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1347,24 +1397,24 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB5_2: -; 
GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1376,16 +1426,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W64-LABEL: sub_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 +; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 @@ -1393,8 +1443,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) @@ -1406,38 +1456,38 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W32-LABEL: sub_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 +; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 ; GFX10W32-NEXT: 
s_mov_b32 s4, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 +; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX10W32-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 +; GFX11W64-NEXT: s_load_b32 s6, s[2:3], 0x44 ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b64 s[0:1], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1445,7 +1495,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) 
%out, ptr addrspace( ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: s_mul_i32 s4, s6, s4 @@ -1453,8 +1503,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB5_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) @@ -1469,42 +1519,42 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11W32-LABEL: sub_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44 +; GFX11W32-NEXT: s_load_b32 s0, s[2:3], 0x44 ; GFX11W32-NEXT: s_mov_b32 s4, exec_lo -; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s1, exec_lo ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s4, s2, s4 +; GFX11W32-NEXT: s_mul_i32 s4, s0, s4 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB5_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, 
s3 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX11W32-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_uniform: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_load_b32 s6, s[0:1], 0x44 +; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44 ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[2:3], exec +; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1512,7 +1562,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 @@ -1520,8 +1570,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB5_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX12W64-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 @@ -1536,33 +1586,33 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W32-LABEL: sub_i32_uniform: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_load_b32 s2, s[0:1], 0x44 +; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44 ; GFX12W32-NEXT: s_mov_b32 s4, exec_lo -; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s1, exec_lo ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: s_mul_i32 s4, s2, s4 +; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB5_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX12W32-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX12W32-NEXT: 
global_store_b32 v1, v0, s[2:3] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -1575,8 +1625,8 @@ entry: define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -1587,36 +1637,36 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB6_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB6_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx4 s[8:11], 
s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB6_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1 @@ -1628,36 +1678,36 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB6_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB6_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: 
buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB6_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1668,37 +1718,38 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W64-LABEL: sub_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b64 s[0:1], exec ; GFX10W64-NEXT: s_mov_b32 s4, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB6_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_add_i32 s4, s4, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX10W64-NEXT: s_cbranch_execz .LBB6_4 ; GFX10W64-NEXT: ; %bb.3: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; 
GFX10W64-NEXT: .LBB6_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) +; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 @@ -1708,36 +1759,37 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W32-LABEL: sub_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s3, exec_lo -; GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: s_mov_b32 s1, exec_lo +; GFX10W32-NEXT: s_mov_b32 s0, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB6_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s1 ; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 -; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 -; GFX10W32-NEXT: s_add_i32 s2, s2, s5 -; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10W32-NEXT: v_writelane_b32 v1, s0, s4 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_add_i32 s0, s0, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX10W32-NEXT: s_cbranch_execz .LBB6_4 ; GFX10W32-NEXT: ; %bb.3: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10W32-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB6_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) +; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 @@ -1747,176 +1799,184 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX11W64-LABEL: sub_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11W64-NEXT: s_mov_b64 s[0:1], exec ; GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 ; GFX11W64-NEXT: .LBB6_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; 
GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX11W64-NEXT: s_cbranch_execz .LBB6_4 ; GFX11W64-NEXT: ; %bb.3: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB6_4: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 +; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_vdata: ; GFX11W32: ; %bb.0: 
; %entry -; GFX11W32-NEXT: s_mov_b32 s3, exec_lo -; GFX11W32-NEXT: s_mov_b32 s2, 0 -; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11W32-NEXT: s_mov_b32 s1, exec_lo +; GFX11W32-NEXT: s_mov_b32 s0, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 ; GFX11W32-NEXT: .LBB6_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 -; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX11W32-NEXT: s_add_i32 s2, s2, s5 -; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: s_add_i32 s0, s0, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX11W32-NEXT: s_cbranch_execz .LBB6_4 ; GFX11W32-NEXT: ; %bb.3: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: 
v_mov_b32_e32 v1, s0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB6_4: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_varying_vdata: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_mov_b64 s[2:3], exec +; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: s_mov_b32 s4, 0 -; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: ; implicit-def: $vgpr0 ; GFX12W64-NEXT: .LBB6_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; 
GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX12W64-NEXT: ; implicit-def: $vgpr0 -; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX12W64-NEXT: s_cbranch_execz .LBB6_4 ; GFX12W64-NEXT: ; %bb.3: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB6_4: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 +; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: 
s_wait_kmcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: sub_i32_varying_vdata: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_mov_b32 s3, exec_lo -; GFX12W32-NEXT: s_mov_b32 s2, 0 -; GFX12W32-NEXT: ; implicit-def: $vgpr1 +; GFX12W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12W32-NEXT: s_mov_b32 s1, exec_lo +; GFX12W32-NEXT: s_mov_b32 s0, 0 +; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB6_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_ctz_i32_b32 s4, s3 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 v1, s2, s4 -; GFX12W32-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX12W32-NEXT: s_add_co_i32 s2, s2, s5 -; GFX12W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX12W32-NEXT: ; implicit-def: $vgpr0 -; GFX12W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX12W32-NEXT: ; implicit-def: $vgpr1 +; GFX12W32-NEXT: 
s_and_saveexec_b32 s1, vcc_lo +; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12W32-NEXT: s_cbranch_execz .LBB6_4 ; GFX12W32-NEXT: ; %bb.3: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB6_4: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 +; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -1930,8 +1990,8 @@ entry: define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc @@ -1943,9 +2003,9 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: sub_i32_varying_offset: ; GFX8: ; %bb.0: ; 
%entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 offen glc ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1956,9 +2016,9 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: sub_i32_varying_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -1968,9 +2028,10 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: sub_i32_varying_offset: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -1978,36 +2039,73 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: sub_i32_varying_offset: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt 
vmcnt(0) -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11W64-LABEL: sub_i32_varying_offset: +; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: s_clause 0x1 +; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W64-NEXT: s_nop 0 +; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W64-NEXT: s_endpgm ; -; GFX12-LABEL: sub_i32_varying_offset: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX12-NEXT: v_mov_b32_e32 v1, 1 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm +; GFX11W32-LABEL: sub_i32_varying_offset: +; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: s_clause 0x1 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W32-NEXT: s_nop 0 +; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W32-NEXT: s_endpgm +; +; GFX12W64-LABEL: sub_i32_varying_offset: +; GFX12W64: ; 
%bb.0: ; %entry +; GFX12W64-NEXT: s_clause 0x1 +; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 +; GFX12W64-NEXT: s_wait_kmcnt 0x0 +; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX12W64-NEXT: s_wait_loadcnt 0x0 +; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12W64-NEXT: s_nop 0 +; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12W64-NEXT: s_endpgm +; +; GFX12W32-LABEL: sub_i32_varying_offset: +; GFX12W32: ; %bb.0: ; %entry +; GFX12W32-NEXT: s_clause 0x1 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12W32-NEXT: s_wait_kmcnt 0x0 +; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX12W32-NEXT: s_wait_loadcnt 0x0 +; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12W32-NEXT: s_nop 0 +; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX11: {{.*}} +; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index 7e15c07f95269..720e2ef108076 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -23,10 +23,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_cbranch_execz .LBB0_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_mul_i32 s4, s4, 5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 @@ -34,8 +34,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX6-NEXT: .LBB0_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -52,10 +52,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB0_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 
s4, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 @@ -63,8 +63,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB0_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2 @@ -81,10 +81,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 @@ -92,8 +92,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB0_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -109,10 +109,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX10W64-NEXT: 
s_cbranch_execz .LBB0_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 @@ -121,9 +121,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB0_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) +; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -133,25 +134,26 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10W32-LABEL: add_i32_constant: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX10W32-NEXT: s_mul_i32 s3, s3, 5 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 +; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v1, v2, s[4:7], 0 idxen glc ; 
GFX10W32-NEXT: .LBB0_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) +; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -162,7 +164,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-LABEL: add_i32_constant: ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b64 s[0:1], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -170,7 +172,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 @@ -179,8 +181,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB0_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 @@ -194,25 +196,25 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr 
addrspace ; ; GFX11W32-LABEL: add_i32_constant: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_mov_b32 s3, exec_lo -; GFX11W32-NEXT: s_mov_b32 s2, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX11W32-NEXT: s_mov_b32 s1, exec_lo +; GFX11W32-NEXT: s_mov_b32 s0, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX11W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], 0 idxen glc ; GFX11W32-NEXT: .LBB0_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 @@ -227,7 +229,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-LABEL: add_i32_constant: ; GFX12W64: ; %bb.0: ; %entry ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[2:3], exec +; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -235,7 +237,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 
0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 @@ -244,8 +246,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB0_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 @@ -259,24 +261,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12W32-LABEL: add_i32_constant: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_mov_b32 s3, exec_lo -; GFX12W32-NEXT: s_mov_b32 s2, exec_lo -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX12W32-NEXT: s_mov_b32 s1, exec_lo +; GFX12W32-NEXT: s_mov_b32 s0, exec_lo +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX12W32-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: s_mul_i32 s3, s3, 5 -; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; 
GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB0_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 @@ -297,15 +299,15 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: add_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 +; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_cbranch_execz .LBB1_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mul_i32 s4, s6, s4 @@ -313,8 +315,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX6-NEXT: .LBB1_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -327,16 +329,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 +; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX8-NEXT: 
s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mul_i32 s4, s6, s4 @@ -344,8 +346,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -358,16 +360,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s4, s6, s4 @@ -375,8 +377,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX9-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -388,16 +390,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W64-LABEL: add_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 +; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) @@ -406,9 +408,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) +; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3] @@ -418,38 +421,38 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; 
; GFX10W32-LABEL: add_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 +; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 ; GFX10W32-NEXT: s_mov_b32 s4, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 +; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5] +; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5] ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 +; GFX11W64-NEXT: s_load_b32 s6, s[2:3], 0x44 ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b64 s[0:1], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) @@ -457,7 +460,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) @@ -466,8 +469,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB1_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) @@ -481,42 +484,42 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11W32-LABEL: add_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44 +; GFX11W32-NEXT: s_load_b32 s0, s[2:3], 0x44 ; GFX11W32-NEXT: s_mov_b32 s4, exec_lo -; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s1, exec_lo ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s4, s2, s4 +; GFX11W32-NEXT: s_mul_i32 s4, s0, s4 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W32-NEXT: .LBB1_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5] +; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5] ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_uniform: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_load_b32 s6, s[0:1], 0x44 +; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44 ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[2:3], exec +; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -524,7 +527,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -533,8 +536,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB1_2: -; 
GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -548,32 +551,32 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W32-LABEL: add_i32_uniform: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_load_b32 s2, s[0:1], 0x44 +; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44 ; GFX12W32-NEXT: s_mov_b32 s4, exec_lo -; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s1, exec_lo ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: s_mul_i32 s4, s2, s4 +; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB1_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5] +; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] +; 
GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -586,8 +589,8 @@ entry: define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 idxen glc @@ -599,37 +602,37 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB2_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx4 
s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB2_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 @@ -641,37 +644,37 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB2_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB2_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -682,38 +685,39 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W64-LABEL: add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b64 s[0:1], exec ; GFX10W64-NEXT: s_mov_b32 s4, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_add_i32 s4, s4, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX10W64-NEXT: ; %bb.3: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) +; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -723,37 +727,38 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W32-LABEL: add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s3, exec_lo -; GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: s_mov_b32 s1, exec_lo +; GFX10W32-NEXT: s_mov_b32 s0, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s1 ; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 -; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 -; GFX10W32-NEXT: s_add_i32 s2, s2, s5 -; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10W32-NEXT: v_writelane_b32 v1, s0, s4 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_add_i32 s0, s0, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX10W32-NEXT: ; %bb.3: 
-; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v0, v2, s[4:7], 0 idxen glc ; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) +; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -763,178 +768,184 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX11W64-LABEL: add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11W64-NEXT: s_mov_b64 s[0:1], exec ; GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 ; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 -; GFX11W64-NEXT: 
s_cmp_lg_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W64-NEXT: ; %bb.3: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], 0 idxen glc +; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB2_4: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 +; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W64-NEXT: 
global_store_b32 v1, v0, s[0:1] ; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_mov_b32 s3, exec_lo -; GFX11W32-NEXT: s_mov_b32 s2, 0 -; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11W32-NEXT: s_mov_b32 s1, exec_lo +; GFX11W32-NEXT: s_mov_b32 s0, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 ; GFX11W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 -; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX11W32-NEXT: s_add_i32 s2, s2, s5 -; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: s_add_i32 s0, s0, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX11W32-NEXT: 
s_cbranch_execz .LBB2_4 ; GFX11W32-NEXT: ; %bb.3: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 -; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], 0 idxen glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], 0 idxen glc ; GFX11W32-NEXT: .LBB2_4: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 +; GFX11W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_varying_vdata: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_mov_b64 s[2:3], exec +; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: s_mov_b32 s4, 0 -; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: ; implicit-def: $vgpr0 ; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX12W64-NEXT: ; implicit-def: $vgpr0 -; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W64-NEXT: ; %bb.3: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB2_4: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: 
v_readfirstlane_b32 s2, v0 -; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 +; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: add_i32_varying_vdata: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_mov_b32 s3, exec_lo -; GFX12W32-NEXT: s_mov_b32 s2, 0 -; GFX12W32-NEXT: ; implicit-def: $vgpr1 +; GFX12W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12W32-NEXT: s_mov_b32 s1, exec_lo +; GFX12W32-NEXT: s_mov_b32 s0, 0 +; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_ctz_i32_b32 s4, s3 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 v1, s2, s4 -; GFX12W32-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX12W32-NEXT: s_add_co_i32 s2, s2, s5 -; GFX12W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX12W32-NEXT: ; implicit-def: $vgpr0 -; GFX12W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX12W32-NEXT: ; implicit-def: $vgpr1 +; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W32-NEXT: ; %bb.3: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX12W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB2_4: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 +; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -948,8 +959,8 @@ entry: define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_vindex: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 idxen glc @@ -961,9 +972,9 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: add_i32_varying_vindex: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 idxen glc ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -974,9 +985,9 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: add_i32_varying_vindex: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 idxen glc ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -986,9 +997,10 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: add_i32_varying_vindex: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 idxen glc ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -996,33 +1008,67 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; -; 
GFX11-LABEL: add_i32_varying_vindex: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 idxen glc -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: add_i32_varying_vindex: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX12-NEXT: v_mov_b32_e32 v1, 1 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN -; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm +; GFX11W64-LABEL: add_i32_varying_vindex: +; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: s_clause 0x1 +; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 idxen glc +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W64-NEXT: s_nop 0 +; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W64-NEXT: s_endpgm +; +; GFX11W32-LABEL: add_i32_varying_vindex: +; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: s_clause 0x1 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11W32-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 idxen glc +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W32-NEXT: s_nop 0 +; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W32-NEXT: s_endpgm +; +; GFX12W64-LABEL: add_i32_varying_vindex: +; GFX12W64: ; %bb.0: ; %entry +; GFX12W64-NEXT: s_clause 0x1 +; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 +; GFX12W64-NEXT: s_wait_kmcnt 0x0 +; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX12W64-NEXT: s_wait_loadcnt 0x0 +; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12W64-NEXT: s_nop 0 +; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12W64-NEXT: s_endpgm +; +; GFX12W32-LABEL: add_i32_varying_vindex: +; GFX12W32: ; %bb.0: ; %entry +; GFX12W32-NEXT: s_clause 0x1 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12W32-NEXT: s_wait_kmcnt 0x0 +; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX12W32-NEXT: s_wait_loadcnt 0x0 +; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12W32-NEXT: s_nop 0 +; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0, i32 0) @@ -1034,10 +1080,10 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX6-LABEL: add_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: v_mov_b32_e32 v1, v0 -; GFX6-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s2, 0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s8, 0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NEXT: v_mov_b32_e32 v2, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc @@ -1049,15 +1095,14 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: add_i32_varying_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX8-NEXT: s_mov_b32 s0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1066,27 +1111,27 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: add_i32_varying_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: add_i32_varying_offset: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -1096,12 +1141,12 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX11W64-LABEL: add_i32_varying_offset: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W64-NEXT: s_mov_b32 s2, 0 -; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s2 +; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W64-NEXT: s_mov_b32 s0, 0 +; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s0 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: v_mov_b32_e32 v2, 1 -; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], 0 idxen offen glc ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 @@ -1113,12 +1158,12 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX11W32-LABEL: add_i32_varying_offset: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_mov_b32 s0, 0 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s2 +; GFX11W32-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_and_b32 v1, 0x3ff, v0 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], 0 idxen offen glc ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 @@ -1130,11 +1175,12 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX12W64-LABEL: add_i32_varying_offset: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX12W64-NEXT: v_mov_b32_e32 v1, v0 +; GFX12W64-NEXT: s_clause 0x1 +; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 1 -; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: s_wait_loadcnt 0x0 @@ -1145,10 +1191,11 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX12W32-LABEL: add_i32_varying_offset: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX12W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0 +; GFX12W32-NEXT: s_clause 0x1 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x3ff, v0 ; GFX12W32-NEXT: v_mov_b32_e32 v2, 1 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -1171,10 +1218,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_cbranch_execz .LBB5_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_mul_i32 s4, s4, 5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 @@ -1182,8 +1229,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX6-NEXT: .LBB5_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1201,10 +1248,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB5_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 @@ -1212,8 +1259,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB5_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: 
v_mul_u32_u24_e32 v0, 5, v0 @@ -1231,10 +1278,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 @@ -1242,8 +1289,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB5_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1260,10 +1307,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 @@ -1272,9 +1319,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 
-; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) +; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 @@ -1285,25 +1333,26 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10W32-LABEL: sub_i32_constant: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX10W32-NEXT: s_mul_i32 s3, s3, 5 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 +; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v1, v2, s[4:7], 0 idxen glc ; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) +; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 @@ -1315,7 +1364,7 @@ define amdgpu_kernel void 
@sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-LABEL: sub_i32_constant: ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b64 s[0:1], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1323,7 +1372,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 @@ -1332,8 +1381,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB5_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1348,25 +1397,25 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX11W32-LABEL: sub_i32_constant: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_mov_b32 s3, exec_lo -; GFX11W32-NEXT: s_mov_b32 s2, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX11W32-NEXT: s_mov_b32 s1, exec_lo +; GFX11W32-NEXT: s_mov_b32 s0, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 
; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX11W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], 0 idxen glc ; GFX11W32-NEXT: .LBB5_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1382,7 +1431,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-LABEL: sub_i32_constant: ; GFX12W64: ; %bb.0: ; %entry ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[2:3], exec +; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1390,7 +1439,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 @@ -1399,8 +1448,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null 
idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB5_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1415,24 +1464,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12W32-LABEL: sub_i32_constant: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_mov_b32 s3, exec_lo -; GFX12W32-NEXT: s_mov_b32 s2, exec_lo -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX12W32-NEXT: s_mov_b32 s1, exec_lo +; GFX12W32-NEXT: s_mov_b32 s0, exec_lo +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX12W32-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: s_mul_i32 s3, s3, 5 -; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB5_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1454,15 +1503,15 @@ define amdgpu_kernel void 
@sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: sub_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 +; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_cbranch_execz .LBB6_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mul_i32 s4, s6, s4 @@ -1470,8 +1519,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX6-NEXT: .LBB6_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1484,16 +1533,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 +; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB6_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 
; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mul_i32 s4, s6, s4 @@ -1501,8 +1550,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB6_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1515,16 +1564,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s4, s6, s4 @@ -1532,8 +1581,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB6_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1545,16 +1594,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; 
GFX10W64-LABEL: sub_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 +; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) @@ -1563,8 +1612,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB6_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) @@ -1576,39 +1625,39 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W32-LABEL: sub_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 +; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 ; GFX10W32-NEXT: s_mov_b32 s4, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W32-NEXT: 
s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 +; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX10W32-NEXT: .LBB6_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX10W32-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 +; GFX11W64-NEXT: s_load_b32 s6, s[2:3], 0x44 ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b64 s[0:1], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1616,7 +1665,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 
@@ -1625,8 +1674,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB6_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) @@ -1641,43 +1690,43 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11W32-LABEL: sub_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44 +; GFX11W32-NEXT: s_load_b32 s0, s[2:3], 0x44 ; GFX11W32-NEXT: s_mov_b32 s4, exec_lo -; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s1, exec_lo ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s4, s2, s4 +; GFX11W32-NEXT: s_mul_i32 s4, s0, s4 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W32-NEXT: .LBB6_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX11W32-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX11W32-NEXT: s_waitcnt 
vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_uniform: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_load_b32 s6, s[0:1], 0x44 +; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44 ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[2:3], exec +; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1685,7 +1734,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -1694,8 +1743,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB6_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 @@ -1710,33 +1759,33 @@ define amdgpu_kernel void @sub_i32_uniform(ptr 
addrspace(1) %out, ptr addrspace( ; ; GFX12W32-LABEL: sub_i32_uniform: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_load_b32 s2, s[0:1], 0x44 +; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44 ; GFX12W32-NEXT: s_mov_b32 s4, exec_lo -; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s1, exec_lo ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: s_mul_i32 s4, s2, s4 +; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB6_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX12W32-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -1749,8 +1798,8 @@ entry: define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; 
GFX6-LABEL: sub_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v0, v1, s[4:7], 0 idxen glc @@ -1762,37 +1811,37 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB7_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB7_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB7_4: -; GFX8-NEXT: s_or_b64 
exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1 @@ -1804,37 +1853,37 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB7_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB7_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB7_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, 
s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1845,38 +1894,39 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W64-LABEL: sub_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b64 s[0:1], exec ; GFX10W64-NEXT: s_mov_b32 s4, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_add_i32 s4, s4, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX10W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX10W64-NEXT: ; %bb.3: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB7_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10W64-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) +; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 @@ -1886,37 +1936,38 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W32-LABEL: sub_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s3, exec_lo -; GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: s_mov_b32 s1, exec_lo +; GFX10W32-NEXT: s_mov_b32 s0, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s1 ; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 -; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 -; GFX10W32-NEXT: s_add_i32 s2, s2, s5 -; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10W32-NEXT: v_writelane_b32 v1, s0, s4 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_add_i32 s0, s0, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX10W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX10W32-NEXT: ; %bb.3: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10W32-NEXT: buffer_atomic_sub v0, v2, s[4:7], 0 idxen glc ; GFX10W32-NEXT: .LBB7_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) +; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 @@ -1926,180 +1977,186 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX11W64-LABEL: sub_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11W64-NEXT: s_mov_b64 s[0:1], exec ; GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 ; GFX11W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; 
GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX11W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX11W64-NEXT: ; %bb.3: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, v2, s[8:11], 0 idxen glc +; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB7_4: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 +; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_mov_b32 s3, exec_lo -; 
GFX11W32-NEXT: s_mov_b32 s2, 0 -; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11W32-NEXT: s_mov_b32 s1, exec_lo +; GFX11W32-NEXT: s_mov_b32 s0, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 ; GFX11W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 -; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX11W32-NEXT: s_add_i32 s2, s2, s5 -; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: s_add_i32 s0, s0, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX11W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX11W32-NEXT: ; %bb.3: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 -; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: v_dual_mov_b32 
v1, s0 :: v_dual_mov_b32 v2, 0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, v2, s[4:7], 0 idxen glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], 0 idxen glc ; GFX11W32-NEXT: .LBB7_4: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_varying_vdata: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_mov_b64 s[2:3], exec +; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: s_mov_b32 s4, 0 -; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: ; implicit-def: $vgpr0 ; GFX12W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], 
s[0:1], s[6:7] +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX12W64-NEXT: ; implicit-def: $vgpr0 -; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX12W64-NEXT: ; implicit-def: $vgpr1 +; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX12W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX12W64-NEXT: ; %bb.3: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_sub_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB7_4: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: sub_i32_varying_vdata: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_mov_b32 s3, exec_lo -; GFX12W32-NEXT: s_mov_b32 s2, 0 -; GFX12W32-NEXT: ; implicit-def: $vgpr1 +; GFX12W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12W32-NEXT: s_mov_b32 s1, exec_lo +; GFX12W32-NEXT: s_mov_b32 s0, 0 +; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_ctz_i32_b32 s4, s3 -; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 v1, s2, s4 -; GFX12W32-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX12W32-NEXT: s_add_co_i32 s2, s2, s5 -; GFX12W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX12W32-NEXT: ; implicit-def: $vgpr0 -; GFX12W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; 
GFX12W32-NEXT: ; implicit-def: $vgpr1 +; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX12W32-NEXT: ; %bb.3: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX12W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_sub_u32 v0, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB7_4: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 +; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -2113,8 +2170,8 @@ entry: define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_vindex: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc @@ -2126,9 +2183,9 @@ define 
amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: sub_i32_varying_vindex: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 idxen glc ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -2139,9 +2196,9 @@ define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: sub_i32_varying_vindex: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -2151,9 +2208,10 @@ define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: sub_i32_varying_vindex: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -2161,33 +2219,67 @@ define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: sub_i32_varying_vindex: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 idxen glc -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: sub_i32_varying_vindex: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX12-NEXT: v_mov_b32_e32 v1, 1 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN -; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm +; GFX11W64-LABEL: sub_i32_varying_vindex: +; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: s_clause 0x1 +; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 idxen glc +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W64-NEXT: s_nop 0 +; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W64-NEXT: s_endpgm +; +; GFX11W32-LABEL: sub_i32_varying_vindex: +; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: s_clause 0x1 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 idxen glc +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W32-NEXT: s_nop 0 +; 
GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W32-NEXT: s_endpgm +; +; GFX12W64-LABEL: sub_i32_varying_vindex: +; GFX12W64: ; %bb.0: ; %entry +; GFX12W64-NEXT: s_clause 0x1 +; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 +; GFX12W64-NEXT: s_wait_kmcnt 0x0 +; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX12W64-NEXT: s_wait_loadcnt 0x0 +; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12W64-NEXT: s_nop 0 +; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12W64-NEXT: s_endpgm +; +; GFX12W32-LABEL: sub_i32_varying_vindex: +; GFX12W32: ; %bb.0: ; %entry +; GFX12W32-NEXT: s_clause 0x1 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12W32-NEXT: s_wait_kmcnt 0x0 +; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX12W32-NEXT: s_wait_loadcnt 0x0 +; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12W32-NEXT: s_nop 0 +; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.sub(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0, i32 0) @@ -2199,10 +2291,10 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX6-LABEL: sub_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: v_mov_b32_e32 v1, v0 -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s2, 0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s8, 0 +; GFX6-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NEXT: v_mov_b32_e32 v2, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc @@ -2214,15 +2306,14 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: sub_i32_varying_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX8-NEXT: s_mov_b32 s0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2231,27 +2322,27 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: sub_i32_varying_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sub_i32_varying_offset: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10-NEXT: 
s_mov_b32 s2, 0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -2261,12 +2352,12 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX11W64-LABEL: sub_i32_varying_offset: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W64-NEXT: s_mov_b32 s2, 0 -; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s2 +; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W64-NEXT: s_mov_b32 s0, 0 +; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s0 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: v_mov_b32_e32 v2, 1 -; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], 0 idxen offen glc ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 @@ -2278,12 +2369,12 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX11W32-LABEL: sub_i32_varying_offset: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_mov_b32 s0, 0 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s2 +; GFX11W32-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_and_b32 v1, 0x3ff, v0 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11W32-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], 0 idxen offen glc ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 @@ -2295,11 +2386,12 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX12W64-LABEL: sub_i32_varying_offset: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX12W64-NEXT: v_mov_b32_e32 v1, v0 +; GFX12W64-NEXT: s_clause 0x1 +; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 1 -; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: s_wait_loadcnt 0x0 @@ -2310,10 +2402,11 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX12W32-LABEL: sub_i32_varying_offset: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX12W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0 +; GFX12W32-NEXT: s_clause 0x1 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x3ff, v0 ; GFX12W32-NEXT: v_mov_b32_e32 v2, 1 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -2327,3 +2420,6 @@ entry: store i32 %old, ptr addrspace(1) %out ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX11: {{.*}} +; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll index 2c69ae58f0e61..417d38990505b 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr, i32) define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) { ; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 @@ -18,7 +18,7 @@ define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) { ; ; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 @@ -33,7 +33,7 @@ entry: define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { ; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 @@ -42,7 +42,7 @@ define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 ; ; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; 
GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 @@ -58,8 +58,8 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr ; GFX12-SDAG-LABEL: flat_atomic_cond_sub_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s6 @@ -72,8 +72,8 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr ; GFX12-GISEL-LABEL: flat_atomic_cond_sub_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v1, s5 @@ -92,7 +92,7 @@ entry: define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32(ptr addrspace(1) %addr, i32 %in) { ; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:-16 th:TH_ATOMIC_RETURN @@ -100,7 +100,7 @@ define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32(ptr addrspace(1) %a ; ; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32: ; 
GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:-16 th:TH_ATOMIC_RETURN @@ -114,7 +114,7 @@ entry: define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspace(1) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { ; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 @@ -124,7 +124,7 @@ define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspac ; ; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 @@ -140,11 +140,11 @@ entry: define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) { ; GFX12-SDAG-LABEL: global_atomic_cond_sub_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: 
v_dual_mov_b32 v1, s6 ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[4:5] offset:16 th:TH_ATOMIC_RETURN ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] @@ -155,8 +155,8 @@ define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr ; GFX12-GISEL-LABEL: global_atomic_cond_sub_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[4:5] offset:16 th:TH_ATOMIC_RETURN @@ -175,7 +175,7 @@ entry: define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %in) { ; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -185,7 +185,7 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %i ; ; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -201,7 +201,7 @@ entry: define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { ; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; 
GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -211,7 +211,7 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, ; ; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -227,7 +227,7 @@ entry: define amdgpu_kernel void @ds_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) { ; GFX12-SDAG-LABEL: ds_cond_sub_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1 offset:16 @@ -238,7 +238,7 @@ define amdgpu_kernel void @ds_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ; ; GFX12-GISEL-LABEL: ds_cond_sub_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v1, v0 offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll index 7da058ca6ee7e..14519f5a5e77c 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll @@ -116,9 +116,9 @@ attributes #8 = {"amdgpu-waves-per-eu"="5,10"} ; Exactly 10 waves per execution unit. 
; CHECK-LABEL: {{^}}exactly_10: -; CHECK: SGPRBlocks: 1 +; CHECK: SGPRBlocks: 2 ; CHECK: VGPRBlocks: 5 -; CHECK: NumSGPRsForWavesPerEU: 12 +; CHECK: NumSGPRsForWavesPerEU: 20 ; CHECK: NumVGPRsForWavesPerEU: 24 define amdgpu_kernel void @exactly_10() #9 { %val0 = load volatile float, ptr addrspace(1) @var diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll index b2f01660201d7..90562e25a3e9c 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll @@ -1,6 +1,6 @@ -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefix=OPT %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=NOOPT,COV4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=NOOPT,COV5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O2 | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefix=OPT %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O0 | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=NOOPT,COV4 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O0 | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=NOOPT,COV5 %s ; Check that AMDGPUAttributor is not run with -O0. 
; OPT: .amdhsa_user_sgpr_private_segment_buffer 1 diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index a86a3f6f279d7..16ffdd7ebe421 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -3781,21 +3781,21 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GCN-LABEL: test_call: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s33 +; GCN-NEXT: s_mov_b32 s18, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[16:17] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v2, s30, 0 ; GCN-NEXT: v_writelane_b32 v2, s31, 1 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen @@ -3806,27 +3806,27 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call: ; GFX7: ; %bb.0: ; 
%entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s8, s33 +; GFX7-NEXT: s_mov_b32 s18, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; GFX7-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_mov_b64 exec, s[16:17] ; GFX7-NEXT: s_addk_i32 s32, 0x400 -; GFX7-NEXT: s_getpc_b64 s[4:5] -; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 -; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: s_getpc_b64 s[16:17] +; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX7-NEXT: v_writelane_b32 v2, s30, 0 ; GFX7-NEXT: v_writelane_b32 v2, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen @@ -3837,27 +3837,27 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 +; GFX7-NEXT: s_mov_b32 s33, s18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, s33 +; GFX8-NEXT: s_mov_b32 s18, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b64 
exec, s[16:17] ; GFX8-NEXT: s_addk_i32 s32, 0x400 -; GFX8-NEXT: s_getpc_b64 s[4:5] -; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 -; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: s_getpc_b64 s[16:17] +; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX8-NEXT: v_writelane_b32 v2, s30, 0 ; GFX8-NEXT: v_writelane_b32 v2, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX8-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v2, 1 @@ -3866,27 +3866,27 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 +; GFX8-NEXT: s_mov_b32 s33, s18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s18, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, 
test_arg_store@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX9-NEXT: v_writelane_b32 v2, s30, 0 ; GFX9-NEXT: v_writelane_b32 v2, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 @@ -3895,28 +3895,28 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: s_mov_b32 s33, s18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s18, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s16, -1 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s16 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[16:17] +; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v2, s30, 0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX10-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen 
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 @@ -3926,7 +3926,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 +; GFX10-NEXT: s_mov_b32 s33, s18 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3968,21 +3968,21 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-LABEL: test_call_v2bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s33 +; GCN-NEXT: s_mov_b32 s18, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[16:17] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v4, s30, 0 ; GCN-NEXT: v_writelane_b32 v4, s31, 1 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_add_i32_e32 v3, vcc, 2, v2 @@ -3998,27 +3998,27 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: 
s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v2bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s8, s33 +; GFX7-NEXT: s_mov_b32 s18, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_mov_b64 exec, s[16:17] ; GFX7-NEXT: s_addk_i32 s32, 0x400 -; GFX7-NEXT: s_getpc_b64 s[4:5] -; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: s_getpc_b64 s[16:17] +; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX7-NEXT: v_writelane_b32 v4, s30, 0 ; GFX7-NEXT: v_writelane_b32 v4, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -4034,27 +4034,27 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 +; GFX7-NEXT: s_mov_b32 s33, s18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v2bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, s33 +; GFX8-NEXT: s_mov_b32 s18, s33 ; GFX8-NEXT: 
s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b64 exec, s[16:17] ; GFX8-NEXT: s_addk_i32 s32, 0x400 -; GFX8-NEXT: s_getpc_b64 s[4:5] -; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: s_getpc_b64 s[16:17] +; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX8-NEXT: v_writelane_b32 v2, s30, 0 ; GFX8-NEXT: v_writelane_b32 v2, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v2, 1 @@ -4063,27 +4063,27 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 +; GFX8-NEXT: s_mov_b32 s33, s18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v2bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s18, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: 
s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX9-NEXT: v_writelane_b32 v2, s30, 0 ; GFX9-NEXT: v_writelane_b32 v2, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 @@ -4092,28 +4092,28 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: s_mov_b32 s33, s18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v2bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s18, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s16, -1 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s16 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[16:17] +; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s17, s17, 
test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v2, s30, 0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 @@ -4123,7 +4123,7 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 +; GFX10-NEXT: s_mov_b32 s33, s18 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4165,21 +4165,21 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-LABEL: test_call_v3bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s33 +; GCN-NEXT: s_mov_b32 s18, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[16:17] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v5, s30, 0 ; GCN-NEXT: v_writelane_b32 v5, s31, 1 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; 
GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -4197,27 +4197,27 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v3bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s8, s33 +; GFX7-NEXT: s_mov_b32 s18, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_mov_b64 exec, s[16:17] ; GFX7-NEXT: s_addk_i32 s32, 0x400 -; GFX7-NEXT: s_getpc_b64 s[4:5] -; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: s_getpc_b64 s[16:17] +; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX7-NEXT: v_writelane_b32 v4, s30, 0 ; GFX7-NEXT: v_writelane_b32 v4, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -4235,27 +4235,27 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 
4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 +; GFX7-NEXT: s_mov_b32 s33, s18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v3bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, s33 +; GFX8-NEXT: s_mov_b32 s18, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; GFX8-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b64 exec, s[16:17] ; GFX8-NEXT: s_addk_i32 s32, 0x400 -; GFX8-NEXT: s_getpc_b64 s[4:5] -; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: s_getpc_b64 s[16:17] +; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX8-NEXT: v_writelane_b32 v4, s30, 0 ; GFX8-NEXT: v_writelane_b32 v4, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 ; GFX8-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -4267,27 +4267,27 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 +; GFX8-NEXT: s_mov_b32 s33, s18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v3bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s18, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX9-NEXT: v_writelane_b32 v3, s30, 0 ; GFX9-NEXT: v_writelane_b32 v3, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen @@ -4298,28 +4298,28 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: s_mov_b32 s33, s18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v3bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s18, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s16, -1 ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 
exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s16 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[16:17] +; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v3, s30, 0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX10-NEXT: v_writelane_b32 v3, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX10-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen @@ -4331,7 +4331,7 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 +; GFX10-NEXT: s_mov_b32 s33, s18 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4375,21 +4375,21 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-LABEL: test_call_v4bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s33 +; GCN-NEXT: s_mov_b32 s18, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[16:17] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v8, s30, 0 ; GCN-NEXT: v_writelane_b32 v8, s31, 1 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: 
s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -4415,27 +4415,27 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v4bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s8, s33 +; GFX7-NEXT: s_mov_b32 s18, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; GFX7-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_mov_b64 exec, s[16:17] ; GFX7-NEXT: s_addk_i32 s32, 0x400 -; GFX7-NEXT: s_getpc_b64 s[4:5] -; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: s_getpc_b64 s[16:17] +; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX7-NEXT: v_writelane_b32 v6, s30, 
0 ; GFX7-NEXT: v_writelane_b32 v6, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -4461,27 +4461,27 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 +; GFX7-NEXT: s_mov_b32 s33, s18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v4bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, s33 +; GFX8-NEXT: s_mov_b32 s18, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; GFX8-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b64 exec, s[16:17] ; GFX8-NEXT: s_addk_i32 s32, 0x400 -; GFX8-NEXT: s_getpc_b64 s[4:5] -; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: s_getpc_b64 s[16:17] +; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX8-NEXT: v_writelane_b32 v4, s30, 0 ; GFX8-NEXT: v_writelane_b32 v4, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 ; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -4493,27 +4493,27 @@ define void @test_call_v4bf16(<4 x 
bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 +; GFX8-NEXT: s_mov_b32 s33, s18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v4bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s18, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX9-NEXT: v_writelane_b32 v3, s30, 0 ; GFX9-NEXT: v_writelane_b32 v3, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen @@ -4524,28 +4524,28 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: s_mov_b32 s33, s18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-LABEL: test_call_v4bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s18, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s16, -1 ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s16 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[16:17] +; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v3, s30, 0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX10-NEXT: v_writelane_b32 v3, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX10-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen @@ -4557,7 +4557,7 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 +; GFX10-NEXT: s_mov_b32 s33, s18 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4599,21 +4599,21 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-LABEL: test_call_v8bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s33 +; GCN-NEXT: s_mov_b32 s18, s33 ; GCN-NEXT: s_mov_b32 s33, s32 
-; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[16:17] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v16, s30, 0 ; GCN-NEXT: v_writelane_b32 v16, s31, 1 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -4659,27 +4659,27 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v8bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s8, s33 +; GFX7-NEXT: s_mov_b32 s18, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; GFX7-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_mov_b64 exec, s[16:17] ; GFX7-NEXT: s_addk_i32 s32, 0x400 -; GFX7-NEXT: s_getpc_b64 s[4:5] -; GFX7-NEXT: s_add_u32 s4, s4, 
test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: s_getpc_b64 s[16:17] +; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX7-NEXT: v_writelane_b32 v10, s30, 0 ; GFX7-NEXT: v_writelane_b32 v10, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 @@ -4725,27 +4725,27 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 +; GFX7-NEXT: s_mov_b32 s33, s18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v8bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, s33 +; GFX8-NEXT: s_mov_b32 s18, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; GFX8-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b64 exec, s[16:17] ; GFX8-NEXT: s_addk_i32 s32, 0x400 -; GFX8-NEXT: s_getpc_b64 s[4:5] -; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: s_getpc_b64 s[16:17] +; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX8-NEXT: 
s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX8-NEXT: v_writelane_b32 v6, s30, 0 ; GFX8-NEXT: v_writelane_b32 v6, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 12, v4 ; GFX8-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -4763,27 +4763,27 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 +; GFX8-NEXT: s_mov_b32 s33, s18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v8bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s18, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX9-NEXT: v_writelane_b32 v5, s30, 0 ; GFX9-NEXT: v_writelane_b32 v5, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; 
GFX9-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8 @@ -4798,28 +4798,28 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: s_mov_b32 s33, s18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v8bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s18, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s16, -1 ; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s16 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[16:17] +; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v5, s30, 0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX10-NEXT: v_writelane_b32 v5, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8 @@ -4835,7 +4835,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: 
s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 +; GFX10-NEXT: s_mov_b32 s33, s18 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4877,21 +4877,21 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-LABEL: test_call_v16bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s33 +; GCN-NEXT: s_mov_b32 s18, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[16:17] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v21, s30, 0 ; GCN-NEXT: v_writelane_b32 v21, s31, 1 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -4977,27 +4977,27 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v16bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s8, s33 +; GFX7-NEXT: s_mov_b32 s18, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; GFX7-NEXT: buffer_store_dword v18, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_mov_b64 exec, s[16:17] ; GFX7-NEXT: s_addk_i32 s32, 0x400 -; GFX7-NEXT: s_getpc_b64 s[4:5] -; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: s_getpc_b64 s[16:17] +; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX7-NEXT: v_writelane_b32 v18, s30, 0 ; GFX7-NEXT: v_writelane_b32 v18, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 @@ -5083,27 +5083,27 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 +; GFX7-NEXT: s_mov_b32 s33, s18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v16bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, s33 +; GFX8-NEXT: s_mov_b32 s18, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; GFX8-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: 
s_mov_b64 exec, s[16:17] ; GFX8-NEXT: s_addk_i32 s32, 0x400 -; GFX8-NEXT: s_getpc_b64 s[4:5] -; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: s_getpc_b64 s[16:17] +; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX8-NEXT: v_writelane_b32 v10, s30, 0 ; GFX8-NEXT: v_writelane_b32 v10, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 28, v8 ; GFX8-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -5133,27 +5133,27 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 +; GFX8-NEXT: s_mov_b32 s33, s18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v16bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s18, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: 
s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX9-NEXT: v_writelane_b32 v9, s30, 0 ; GFX9-NEXT: v_writelane_b32 v9, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24 @@ -5176,28 +5176,28 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: s_mov_b32 s33, s18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v16bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s18, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s16, -1 ; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s16 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[16:17] +; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v9, s30, 0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX10-NEXT: v_writelane_b32 v9, 
s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX10-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24 @@ -5221,7 +5221,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 +; GFX10-NEXT: s_mov_b32 s33, s18 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -27297,7 +27297,7 @@ define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: s_and_b32 s4, s4, 0x80000000 +; GCN-NEXT: s_and_b32 s4, s6, 0x80000000 ; GCN-NEXT: s_lshr_b32 s4, s4, 16 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GCN-NEXT: v_or_b32_e32 v0, s4, v0 @@ -27308,7 +27308,7 @@ define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_and_b32 s4, s4, 0x80000000 +; GFX7-NEXT: s_and_b32 s4, s6, 0x80000000 ; GFX7-NEXT: s_lshr_b32 s4, s4, 16 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7-NEXT: v_or_b32_e32 v0, s4, v0 @@ -27318,23 +27318,23 @@ define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) { ; GFX8-LABEL: v_copysign_bf16_s_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s5, 0x7fff -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_bfi_b32 v0, s5, v0, v1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_copysign_bf16_s_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s5, 0x7fff -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_bfi_b32 v0, s5, v0, v1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_copysign_bf16_s_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, s4 +; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, s6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_copysign_bf16_s_bf16: @@ -27350,7 +27350,7 @@ define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) { ; GCN-LABEL: v_copysign_s_bf16_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s6 ; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 @@ -27361,7 +27361,7 @@ define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) { ; GFX7-LABEL: v_copysign_s_bf16_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s6 ; GFX7-NEXT: v_and_b32_e32 v0, 0x80000000, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 @@ -27372,23 +27372,23 @@ define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) { ; GFX8-LABEL: v_copysign_s_bf16_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s5, 0x7fff -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_bfi_b32 v0, s5, v1, v0 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_bfi_b32 v0, s4, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_copysign_s_bf16_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s5, 0x7fff -; GFX9-NEXT: 
v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_bfi_b32 v0, s5, v1, v0 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_bfi_b32 v0, s4, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_copysign_s_bf16_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s4, v0 +; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s6, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_copysign_s_bf16_bf16: diff --git a/llvm/test/CodeGen/AMDGPU/bfe-combine.ll b/llvm/test/CodeGen/AMDGPU/bfe-combine.ll index 0f20ed1320dad..2c179de2a9c35 100644 --- a/llvm/test/CodeGen/AMDGPU/bfe-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/bfe-combine.ll @@ -6,10 +6,10 @@ define amdgpu_kernel void @bfe_combine8(ptr addrspace(1) nocapture %arg, i32 %x) { ; VI-LABEL: bfe_combine8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_bfe_u32 v0, v0, 8, 8 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -24,11 +24,11 @@ define amdgpu_kernel void @bfe_combine8(ptr addrspace(1) nocapture %arg, i32 %x) ; ; VI-SDWA-LABEL: bfe_combine8: ; VI-SDWA: ; %bb.0: -; VI-SDWA-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDWA-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDWA-NEXT: v_mov_b32_e32 v1, 2 ; VI-SDWA-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-SDWA-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; VI-SDWA-NEXT: v_mov_b32_e32 v1, s1 ; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -42,13 +42,13 @@ define 
amdgpu_kernel void @bfe_combine8(ptr addrspace(1) nocapture %arg, i32 %x) ; ; CI-LABEL: bfe_combine8: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[0:1], 0xb -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dword s4, s[2:3], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: s_mov_b32 s7, s3 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; CI-NEXT: v_lshrrev_b32_e32 v0, 6, v0 ; CI-NEXT: s_mov_b64 s[4:5], s[0:1] ; CI-NEXT: v_and_b32_e32 v0, 0x3fc, v0 @@ -71,11 +71,11 @@ define amdgpu_kernel void @bfe_combine8(ptr addrspace(1) nocapture %arg, i32 %x) define amdgpu_kernel void @bfe_combine16(ptr addrspace(1) nocapture %arg, i32 %x) { ; VI-LABEL: bfe_combine16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_bfe_u32 v0, v0, 16, 16 ; VI-NEXT: v_lshlrev_b32_e32 v0, 15, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] @@ -91,11 +91,11 @@ define amdgpu_kernel void @bfe_combine16(ptr addrspace(1) nocapture %arg, i32 %x ; ; VI-SDWA-LABEL: bfe_combine16: ; VI-SDWA: ; %bb.0: -; VI-SDWA-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDWA-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDWA-NEXT: v_mov_b32_e32 v1, 15 ; VI-SDWA-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-SDWA-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-SDWA-NEXT: v_mov_b32_e32 v1, 0 ; VI-SDWA-NEXT: v_lshlrev_b64 v[0:1], 2, 
v[0:1] @@ -111,13 +111,13 @@ define amdgpu_kernel void @bfe_combine16(ptr addrspace(1) nocapture %arg, i32 %x ; ; CI-LABEL: bfe_combine16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[0:1], 0xb -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dword s4, s[2:3], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; CI-NEXT: v_lshrrev_b32_e32 v0, 1, v0 ; CI-NEXT: v_and_b32_e32 v0, 0x7fff8000, v0 ; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll index af4116bd6aae5..f54ea615ca664 100644 --- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; SI-LABEL: v_ubfe_sub_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -23,7 +23,7 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: v_ubfe_sub_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -55,7 +55,7 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; SI-LABEL: v_ubfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: 
s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -78,7 +78,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p ; ; VI-LABEL: v_ubfe_sub_multi_use_shl_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -115,7 +115,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 { ; SI-LABEL: s_ubfe_sub_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -131,7 +131,7 @@ define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % ; ; VI-LABEL: s_ubfe_sub_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -155,7 +155,7 @@ define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 { ; SI-LABEL: s_ubfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -175,7 +175,7 @@ define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i ; ; VI-LABEL: s_ubfe_sub_multi_use_shl_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -203,7 +203,7 @@ define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; SI-LABEL: v_sbfe_sub_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -221,7 +221,7 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: v_sbfe_sub_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -253,7 +253,7 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; SI-LABEL: v_sbfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -276,7 +276,7 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p ; ; VI-LABEL: v_sbfe_sub_multi_use_shl_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -313,7 +313,7 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 { ; SI-LABEL: s_sbfe_sub_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -329,7 +329,7 @@ define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % ; ; VI-LABEL: s_sbfe_sub_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -353,7 +353,7 @@ define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 { ; SI-LABEL: s_sbfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -373,7 +373,7 @@ define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i ; ; VI-LABEL: s_sbfe_sub_multi_use_shl_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -401,8 +401,8 @@ define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: s_sbfe_or_shl_shl_uniform_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[6:7], 0x0 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -417,8 +417,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(ptr addrspace(1) %out, ; ; VI-LABEL: 
s_sbfe_or_shl_shl_uniform_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -444,8 +444,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(ptr addrspace(1) %out, define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(ptr addrspace(1) %out, ptr addrspace(1) %x, ptr addrspace(1) %y) { ; SI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[6:7], 0x0 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -462,8 +462,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(ptr addrspace(1) %ou ; ; VI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -491,8 +491,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(ptr addrspace(1) %ou define amdgpu_kernel void @s_sbfe_or_shl_shl_toosmall_i32(ptr addrspace(1) %out, ptr addrspace(1) %x, ptr addrspace(1) %y) { ; SI-LABEL: s_sbfe_or_shl_shl_toosmall_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[6:7], 0x0 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ 
-509,8 +509,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_toosmall_i32(ptr addrspace(1) %out, ; ; VI-LABEL: s_sbfe_or_shl_shl_toosmall_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll index 7b8eaccaa4142..78d764898a3b9 100644 --- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll +++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll @@ -11,52 +11,50 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) { ; GFX7-LABEL: s_bfi_def_i32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s0, s4 -; GFX7-NEXT: s_mov_b32 s1, s5 -; GFX7-NEXT: s_andn2_b32 s4, s8, s6 -; GFX7-NEXT: s_and_b32 s5, s7, s6 -; GFX7-NEXT: s_or_b32 s4, s4, s5 +; GFX7-NEXT: s_andn2_b32 s6, s6, s4 +; GFX7-NEXT: s_and_b32 s4, s5, s4 +; GFX7-NEXT: s_or_b32 s4, s6, s4 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_bfi_def_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s1, s7, s6 -; GFX8-NEXT: s_andn2_b32 s0, s0, s6 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_mov_b32_e32 
v2, s0 +; GFX8-NEXT: s_andn2_b32 s2, s6, s4 +; GFX8-NEXT: s_and_b32 s3, s5, s4 +; GFX8-NEXT: s_or_b32 s2, s2, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_bfi_def_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s1, s7, s6 -; GFX10-NEXT: s_andn2_b32 s0, s0, s6 -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_andn2_b32 s2, s6, s4 +; GFX10-NEXT: s_and_b32 s3, s5, s4 +; GFX10-NEXT: s_or_b32 s2, s2, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX8-GISEL-LABEL: s_bfi_def_i32: ; GFX8-GISEL: ; %bb.0: ; %entry -; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: s_and_b32 s1, s7, s6 ; GFX8-GISEL-NEXT: s_andn2_b32 s0, s0, s6 @@ -70,8 +68,8 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y, ; GFX10-GISEL-LABEL: s_bfi_def_i32: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_and_b32 s1, s7, s6 @@ -132,52 
+130,50 @@ entry: define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) { ; GFX7-LABEL: s_bfi_sha256_ch: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s0, s4 -; GFX7-NEXT: s_xor_b32 s4, s7, s8 -; GFX7-NEXT: s_and_b32 s4, s6, s4 -; GFX7-NEXT: s_xor_b32 s4, s8, s4 -; GFX7-NEXT: s_mov_b32 s1, s5 +; GFX7-NEXT: s_xor_b32 s5, s5, s6 +; GFX7-NEXT: s_and_b32 s4, s4, s5 +; GFX7-NEXT: s_xor_b32 s4, s6, s4 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_bfi_sha256_ch: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: s_xor_b32 s1, s7, s0 -; GFX8-NEXT: s_and_b32 s1, s6, s1 -; GFX8-NEXT: s_xor_b32 s0, s0, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_xor_b32 s2, s5, s6 +; GFX8-NEXT: s_and_b32 s2, s4, s2 +; GFX8-NEXT: s_xor_b32 s2, s6, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_bfi_sha256_ch: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_xor_b32 s1, s7, s0 -; 
GFX10-NEXT: s_and_b32 s1, s6, s1 -; GFX10-NEXT: s_xor_b32 s0, s0, s1 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_xor_b32 s2, s5, s6 +; GFX10-NEXT: s_and_b32 s2, s4, s2 +; GFX10-NEXT: s_xor_b32 s2, s6, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX8-GISEL-LABEL: s_bfi_sha256_ch: ; GFX8-GISEL: ; %bb.0: ; %entry -; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-GISEL-NEXT: s_xor_b32 s1, s7, s0 @@ -191,8 +187,8 @@ define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y ; GFX10-GISEL-LABEL: s_bfi_sha256_ch: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_xor_b32 s1, s7, s0 @@ -458,55 +454,53 @@ entry: define amdgpu_kernel void @s_bfi_sha256_ma(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) { ; GFX7-LABEL: s_bfi_sha256_ma: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s1, s5 -; GFX7-NEXT: s_or_b32 s5, s6, s8 -; GFX7-NEXT: s_mov_b32 s0, s4 -; GFX7-NEXT: s_and_b32 s4, s6, s8 -; GFX7-NEXT: s_and_b32 s5, s7, s5 -; GFX7-NEXT: s_or_b32 s4, s4, s5 +; GFX7-NEXT: 
s_and_b32 s7, s4, s6 +; GFX7-NEXT: s_or_b32 s4, s4, s6 +; GFX7-NEXT: s_and_b32 s4, s5, s4 +; GFX7-NEXT: s_or_b32 s4, s7, s4 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_bfi_sha256_ma: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: s_and_b32 s1, s6, s0 -; GFX8-NEXT: s_or_b32 s0, s6, s0 -; GFX8-NEXT: s_and_b32 s0, s7, s0 -; GFX8-NEXT: s_or_b32 s0, s1, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_or_b32 s3, s4, s6 +; GFX8-NEXT: s_and_b32 s2, s4, s6 +; GFX8-NEXT: s_and_b32 s3, s5, s3 +; GFX8-NEXT: s_or_b32 s2, s2, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_bfi_sha256_ma: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_or_b32 s1, s6, s0 -; GFX10-NEXT: s_and_b32 s0, s6, s0 -; GFX10-NEXT: s_and_b32 s1, s7, s1 -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_or_b32 s2, s4, s6 +; GFX10-NEXT: s_and_b32 s3, s4, s6 +; GFX10-NEXT: s_and_b32 s2, s5, s2 +; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX8-GISEL-LABEL: s_bfi_sha256_ma: ; GFX8-GISEL: ; %bb.0: ; %entry -; GFX8-GISEL-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-GISEL-NEXT: s_and_b32 s1, s6, s0 @@ -521,8 +515,8 @@ define amdgpu_kernel void @s_bfi_sha256_ma(ptr addrspace(1) %out, i32 %x, i32 %y ; GFX10-GISEL-LABEL: s_bfi_sha256_ma: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_or_b32 s1, s6, s0 @@ -1408,8 +1402,8 @@ entry: define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { ; GFX7-LABEL: s_bitselect_i64_pat_0: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1425,8 +1419,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { ; ; GFX8-LABEL: s_bitselect_i64_pat_0: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7] ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] @@ -1441,8 +1435,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { ; GFX10-LABEL: s_bitselect_i64_pat_0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7] ; GFX10-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] @@ -1456,8 +1450,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { ; ; GFX8-GISEL-LABEL: s_bitselect_i64_pat_0: ; GFX8-GISEL: ; %bb.0: -; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7] ; GFX8-GISEL-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] @@ -1472,8 +1466,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { ; GFX10-GISEL-LABEL: s_bitselect_i64_pat_0: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7] ; GFX10-GISEL-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] @@ -1496,8 +1490,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { ; GFX7-LABEL: s_bitselect_i64_pat_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1513,8 +1507,8 @@ define 
amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { ; ; GFX8-LABEL: s_bitselect_i64_pat_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] ; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] @@ -1529,8 +1523,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { ; GFX10-LABEL: s_bitselect_i64_pat_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] ; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] @@ -1544,8 +1538,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { ; ; GFX8-GISEL-LABEL: s_bitselect_i64_pat_1: ; GFX8-GISEL: ; %bb.0: -; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] ; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] @@ -1560,8 +1554,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { ; GFX10-GISEL-LABEL: s_bitselect_i64_pat_1: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: 
s_xor_b64 s[2:3], s[4:5], s[0:1] ; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] @@ -1584,8 +1578,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { ; GFX7-LABEL: s_bitselect_i64_pat_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1601,8 +1595,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { ; ; GFX8-LABEL: s_bitselect_i64_pat_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] ; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] @@ -1617,8 +1611,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { ; GFX10-LABEL: s_bitselect_i64_pat_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] ; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] @@ -1632,8 +1626,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { ; ; GFX8-GISEL-LABEL: s_bitselect_i64_pat_2: ; GFX8-GISEL: ; %bb.0: -; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] ; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] @@ -1648,8 +1642,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { ; GFX10-GISEL-LABEL: s_bitselect_i64_pat_2: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] ; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] @@ -1672,8 +1666,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) { ; GFX7-LABEL: s_bfi_sha256_ma_i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1690,8 +1684,8 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) { ; ; GFX8-LABEL: s_bfi_sha256_ma_i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b64 s[2:3], s[4:5], s[0:1] ; GFX8-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] @@ -1707,8 +1701,8 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) { ; GFX10-LABEL: s_bfi_sha256_ma_i64: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_or_b64 s[2:3], s[4:5], s[0:1] ; GFX10-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1] @@ -1723,8 +1717,8 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) { ; ; GFX8-GISEL-LABEL: s_bfi_sha256_ma_i64: ; GFX8-GISEL: ; %bb.0: ; %entry -; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[0:1] ; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] @@ -1740,8 +1734,8 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) { ; GFX10-GISEL-LABEL: s_bfi_sha256_ma_i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_or_b64 s[2:3], s[4:5], s[0:1] ; GFX10-GISEL-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll index 0f40576a7459c..4ad3667f68958 100644 --- a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll +++ b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll @@ -283,7 +283,7 @@ define float @v_bfi_single_constant_as_partition(float %x, float %y, float %z) { define amdgpu_kernel void @v_bfi_dont_applied_for_scalar_ops(ptr addrspace(1) %out, i16 %a, i32 %b) { ; GCN-LABEL: v_bfi_dont_applied_for_scalar_ops: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 
0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s3, 0xffff0000 diff --git a/llvm/test/CodeGen/AMDGPU/bfm.ll b/llvm/test/CodeGen/AMDGPU/bfm.ll index f8bd44b7c98f5..2e64db12ef564 100644 --- a/llvm/test/CodeGen/AMDGPU/bfm.ll +++ b/llvm/test/CodeGen/AMDGPU/bfm.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @s_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y) #0 { ; SI-LABEL: s_bfm_pattern: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfm_b32 s2, s2, s3 @@ -18,7 +18,7 @@ define amdgpu_kernel void @s_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y) ; ; VI-LABEL: s_bfm_pattern: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bfm_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -36,11 +36,11 @@ define amdgpu_kernel void @s_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y) define amdgpu_kernel void @s_bfm_pattern_simple(ptr addrspace(1) %out, i32 %x) #0 { ; SI-LABEL: s_bfm_pattern_simple: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfm_b32 s4, s2, 0 +; SI-NEXT: s_bfm_b32 s4, s4, 0 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -48,10 +48,10 @@ define amdgpu_kernel void @s_bfm_pattern_simple(ptr addrspace(1) %out, i32 %x) # ; ; VI-LABEL: s_bfm_pattern_simple: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfm_b32 s2, s2, 0 +; VI-NEXT: 
s_bfm_b32 s2, s4, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll index 64555f14a55cc..6f52da2631b8a 100644 --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -21,8 +21,8 @@ declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1 define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) #0 { ; SI-LABEL: s_brev_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -34,8 +34,8 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; ; FLAT-LABEL: s_brev_i16: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dword s4, s[0:1], 0x2c -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; FLAT-NEXT: s_load_dword s4, s[2:3], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; FLAT-NEXT: s_mov_b32 s3, 0xf000 ; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -47,10 +47,10 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; ; GISEL-LABEL: s_brev_i16: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_and_b32 s2, s2, 0xffff +; GISEL-NEXT: s_and_b32 s2, s4, 0xffff ; GISEL-NEXT: s_brev_b32 s2, s2 ; GISEL-NEXT: s_lshr_b32 s2, s2, 16 ; GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -62,10 +62,10 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; GFX11-FLAT-LABEL: s_brev_i16: ; GFX11-FLAT: ; %bb.0: ; GFX11-FLAT-NEXT: s_clause 0x1 -; 
GFX11-FLAT-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLAT-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLAT-NEXT: s_brev_b32 s2, s2 +; GFX11-FLAT-NEXT: s_brev_b32 s2, s4 ; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-FLAT-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1] @@ -76,11 +76,11 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; GFX11-GISEL-LABEL: s_brev_i16: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-GISEL-NEXT: s_and_b32 s2, s4, 0xffff ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_brev_b32 s2, s2 ; GFX11-GISEL-NEXT: s_lshr_b32 s2, s2, 16 @@ -98,7 +98,7 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -117,7 +117,7 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; ; FLAT-LABEL: v_brev_i16: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_mov_b32 s10, 
s6 @@ -136,7 +136,7 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; ; GISEL-LABEL: v_brev_i16: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -151,7 +151,7 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-FLAT-LABEL: v_brev_i16: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FLAT-NEXT: s_mov_b32 s6, -1 ; GFX11-FLAT-NEXT: v_mov_b32_e32 v1, 0 @@ -168,7 +168,7 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-GISEL-LABEL: v_brev_i16: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_u16 v1, v0, s[2:3] @@ -187,8 +187,8 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) #0 { ; SI-LABEL: s_brev_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -199,8 +199,8 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # ; ; FLAT-LABEL: s_brev_i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dword s4, s[0:1], 0x2c -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; FLAT-NEXT: s_load_dword s4, s[2:3], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; FLAT-NEXT: s_mov_b32 s3, 
0xf000 ; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -211,10 +211,10 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # ; ; GISEL-LABEL: s_brev_i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_brev_b32 s2, s2 +; GISEL-NEXT: s_brev_b32 s2, s4 ; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -224,11 +224,11 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # ; GFX11-FLAT-LABEL: s_brev_i32: ; GFX11-FLAT: ; %bb.0: ; GFX11-FLAT-NEXT: s_clause 0x1 -; GFX11-FLAT-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLAT-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLAT-NEXT: s_brev_b32 s2, s2 +; GFX11-FLAT-NEXT: s_brev_b32 s2, s4 ; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FLAT-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-FLAT-NEXT: s_mov_b32 s2, -1 @@ -240,11 +240,11 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # ; GFX11-GISEL-LABEL: s_brev_i32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_brev_b32 s2, s2 +; GFX11-GISEL-NEXT: s_brev_b32 s2, s4 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, 
s[0:1] @@ -259,7 +259,7 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -278,7 +278,7 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; FLAT-LABEL: v_brev_i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 @@ -294,7 +294,7 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GISEL-LABEL: v_brev_i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -311,7 +311,9 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-FLAT-LABEL: v_brev_i32: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: global_load_b32 v0, v0, s[2:3] @@ -326,8 +328,10 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-GISEL-LABEL: v_brev_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -347,7 +351,7 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> %val) #0 { ; SI-LABEL: s_brev_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -362,7 +366,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; ; FLAT-LABEL: s_brev_v2i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -377,7 +381,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; ; GISEL-LABEL: s_brev_v2i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_brev_b32 s2, s2 ; GISEL-NEXT: s_brev_b32 s3, s3 @@ -390,7 +394,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; ; GFX11-FLAT-LABEL: s_brev_v2i32: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FLAT-NEXT: s_mov_b32 s6, -1 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -407,7 +411,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; ; GFX11-GISEL-LABEL: s_brev_v2i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_brev_b32 s2, s2 @@ -426,7 +430,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -446,7 +450,7 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; FLAT-LABEL: v_brev_v2i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 @@ -463,7 +467,7 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GISEL-LABEL: v_brev_v2i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -481,7 +485,9 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-FLAT-LABEL: v_brev_v2i32: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: global_load_b64 v[0:1], v0, s[2:3] @@ -497,9 +503,11 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-GISEL-LABEL: 
v_brev_v2i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -520,7 +528,7 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) #0 { ; SI-LABEL: s_brev_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -534,7 +542,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # ; ; FLAT-LABEL: s_brev_i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -548,7 +556,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # ; ; GISEL-LABEL: s_brev_i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_brev_b64 s[2:3], s[2:3] ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -560,7 +568,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # ; ; GFX11-FLAT-LABEL: s_brev_i64: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: s_brev_b64 s[4:5], s[2:3] ; 
GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000 @@ -573,7 +581,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # ; ; GFX11-GISEL-LABEL: s_brev_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_brev_b64 s[2:3], s[2:3] @@ -591,7 +599,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -611,7 +619,7 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; FLAT-LABEL: v_brev_i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 @@ -628,7 +636,7 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GISEL-LABEL: v_brev_i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -646,7 +654,9 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-FLAT-LABEL: v_brev_i64: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; 
GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: global_load_b64 v[0:1], v0, s[2:3] @@ -662,7 +672,9 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-GISEL-LABEL: v_brev_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] @@ -685,8 +697,8 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> %val) #0 { ; SI-LABEL: s_brev_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -701,8 +713,8 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> ; ; FLAT-LABEL: s_brev_v2i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; FLAT-NEXT: s_mov_b32 s3, 0xf000 ; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -717,8 +729,8 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> ; ; GISEL-LABEL: s_brev_v2i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_brev_b64 s[0:1], s[4:5] ; 
GISEL-NEXT: s_brev_b64 s[2:3], s[6:7] @@ -734,8 +746,8 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> ; GFX11-FLAT-LABEL: s_brev_v2i64: ; GFX11-FLAT: ; %bb.0: ; GFX11-FLAT-NEXT: s_clause 0x1 -; GFX11-FLAT-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: s_brev_b64 s[2:3], s[4:5] ; GFX11-FLAT-NEXT: s_brev_b64 s[4:5], s[6:7] @@ -751,8 +763,8 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> ; GFX11-GISEL-LABEL: s_brev_v2i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-GISEL-NEXT: s_load_b64 s[8:9], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b64 s[8:9], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_brev_b64 s[0:1], s[4:5] @@ -771,7 +783,7 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -793,7 +805,7 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs ; ; FLAT-LABEL: v_brev_v2i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 @@ -812,7 +824,7 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs ; ; 
GISEL-LABEL: v_brev_v2i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -832,7 +844,9 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-FLAT-LABEL: v_brev_v2i64: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: global_load_b128 v[0:3], v0, s[2:3] @@ -850,7 +864,9 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-GISEL-LABEL: v_brev_v2i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b128 v[0:3], v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll index 3dbbb877918ad..857b13fab8a7c 100644 --- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll @@ -6,18 +6,18 @@ define amdgpu_kernel void @br_cc_f16( ; SI-LABEL: br_cc_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s2, s10 ; SI-NEXT: 
s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -29,28 +29,28 @@ define amdgpu_kernel void @br_cc_f16( ; SI-NEXT: .LBB0_2: ; %two ; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 ; SI-NEXT: .LBB0_3: ; %one -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: br_cc_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s11, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 ; VI-NEXT: s_cbranch_vccnz .LBB0_2 ; VI-NEXT: 
; %bb.1: ; %one @@ -63,8 +63,8 @@ define amdgpu_kernel void @br_cc_f16( ; GFX11-LABEL: br_cc_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[8:9], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s2 @@ -111,7 +111,7 @@ two: define amdgpu_kernel void @br_cc_f16_imm_a( ; SI-LABEL: br_cc_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -137,7 +137,7 @@ define amdgpu_kernel void @br_cc_f16_imm_a( ; ; VI-LABEL: br_cc_f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -157,7 +157,7 @@ define amdgpu_kernel void @br_cc_f16_imm_a( ; ; GFX11-LABEL: br_cc_f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -195,7 +195,7 @@ two: define amdgpu_kernel void @br_cc_f16_imm_b( ; SI-LABEL: br_cc_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -221,7 +221,7 @@ define amdgpu_kernel void @br_cc_f16_imm_b( ; ; VI-LABEL: br_cc_f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) 
@@ -243,7 +243,7 @@ define amdgpu_kernel void @br_cc_f16_imm_b( ; ; GFX11-LABEL: br_cc_f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll index 6201d7341898f..adfc177c8bf74 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll @@ -4,10 +4,10 @@ define amdgpu_kernel void @spill(ptr addrspace(1) %arg, i32 %cnd) #0 { ; CHECK-LABEL: spill: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s44, s[4:5], 0x2 +; CHECK-NEXT: s_load_dword s44, s[6:7], 0x2 ; CHECK-NEXT: s_mov_b64 s[98:99], s[2:3] ; CHECK-NEXT: s_mov_b64 s[96:97], s[0:1] -; CHECK-NEXT: s_add_u32 s96, s96, s7 +; CHECK-NEXT: s_add_u32 s96, s96, s13 ; CHECK-NEXT: s_addc_u32 s97, s97, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_eq_u32 s44, 0 diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll index 2f637df4e9302..635f3e4886b87 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -22,9 +22,9 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 { ; GCN-LABEL: uniform_conditional_max_short_forward_branch: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_load_dword s0, s[2:3], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s2, 0 +; GCN-NEXT: s_cmp_eq_u32 s0, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb2 ; GCN-NEXT: ;;#ASMSTART @@ -34,10 +34,10 @@ define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addr ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_sleep 0 ; 
GCN-NEXT: .LBB0_2: ; %bb3 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -63,9 +63,9 @@ bb3: define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 { ; GCN-LABEL: uniform_conditional_min_long_forward_branch: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_load_dword s0, s[2:3], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s2, 0 +; GCN-NEXT: s_cmp_eq_u32 s0, 0 ; GCN-NEXT: s_cbranch_scc0 .LBB1_1 ; GCN-NEXT: ; %bb.3: ; %bb0 ; GCN-NEXT: s_getpc_b64 s[4:5] @@ -81,10 +81,10 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrs ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB1_2: ; %bb3 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -110,9 +110,9 @@ bb3: define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr addrspace(1) %arg, float %cnd) #0 { ; GCN-LABEL: uniform_conditional_min_long_forward_vcnd_branch: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_load_dword s0, s[2:3], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_f32_e64 s[4:5], s2, 0 +; GCN-NEXT: v_cmp_eq_f32_e64 s[4:5], s0, 0 ; GCN-NEXT: s_and_b64 vcc, exec, s[4:5] ; GCN-NEXT: s_cbranch_vccz .LBB2_1 ; GCN-NEXT: ; %bb.3: ; %bb0 @@ -130,10 +130,10 @@ define amdgpu_kernel void 
@uniform_conditional_min_long_forward_vcnd_branch(ptr ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB2_2: ; %bb3 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -158,7 +158,7 @@ bb3: define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: min_long_forward_vbranch: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -261,28 +261,28 @@ bb3: define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) { ; GCN-LABEL: uniform_unconditional_min_long_forward_branch: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_load_dword s0, s[2:3], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s2, 0 -; GCN-NEXT: s_mov_b64 s[2:3], -1 +; GCN-NEXT: s_cmp_eq_u32 s0, 0 +; GCN-NEXT: s_mov_b64 s[0:1], -1 ; GCN-NEXT: s_cbranch_scc0 .LBB5_1 ; GCN-NEXT: ; %bb.7: ; %bb0 -; GCN-NEXT: s_getpc_b64 s[2:3] +; GCN-NEXT: s_getpc_b64 s[0:1] ; GCN-NEXT: .Lpost_getpc5: -; GCN-NEXT: s_add_u32 s2, s2, (.LBB5_4-.Lpost_getpc5)&4294967295 -; GCN-NEXT: s_addc_u32 s3, s3, (.LBB5_4-.Lpost_getpc5)>>32 -; GCN-NEXT: s_setpc_b64 s[2:3] +; GCN-NEXT: s_add_u32 s0, s0, (.LBB5_4-.Lpost_getpc5)&4294967295 +; GCN-NEXT: s_addc_u32 s1, s1, (.LBB5_4-.Lpost_getpc5)>>32 +; GCN-NEXT: s_setpc_b64 s[0:1] ; GCN-NEXT: .LBB5_1: ; %Flow -; GCN-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GCN-NEXT: s_cbranch_vccnz .LBB5_3 ; GCN-NEXT: .LBB5_2: ; %bb2 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: 
s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v0, 17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB5_3: ; %bb4 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt expcnt(0) @@ -300,17 +300,17 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_cbranch_execnz .LBB5_5 ; GCN-NEXT: ; %bb.9: ; %bb3 -; GCN-NEXT: s_getpc_b64 s[2:3] +; GCN-NEXT: s_getpc_b64 s[0:1] ; GCN-NEXT: .Lpost_getpc6: -; GCN-NEXT: s_add_u32 s2, s2, (.LBB5_2-.Lpost_getpc6)&4294967295 -; GCN-NEXT: s_addc_u32 s3, s3, (.LBB5_2-.Lpost_getpc6)>>32 -; GCN-NEXT: s_setpc_b64 s[2:3] +; GCN-NEXT: s_add_u32 s0, s0, (.LBB5_2-.Lpost_getpc6)&4294967295 +; GCN-NEXT: s_addc_u32 s1, s1, (.LBB5_2-.Lpost_getpc6)>>32 +; GCN-NEXT: s_setpc_b64 s[0:1] ; GCN-NEXT: .LBB5_5: ; %bb3 -; GCN-NEXT: s_getpc_b64 s[2:3] +; GCN-NEXT: s_getpc_b64 s[0:1] ; GCN-NEXT: .Lpost_getpc4: -; GCN-NEXT: s_add_u32 s2, s2, (.LBB5_3-.Lpost_getpc4)&4294967295 -; GCN-NEXT: s_addc_u32 s3, s3, (.LBB5_3-.Lpost_getpc4)>>32 -; GCN-NEXT: s_setpc_b64 s[2:3] +; GCN-NEXT: s_add_u32 s0, s0, (.LBB5_3-.Lpost_getpc4)&4294967295 +; GCN-NEXT: s_addc_u32 s1, s1, (.LBB5_3-.Lpost_getpc4)>>32 +; GCN-NEXT: s_setpc_b64 s[0:1] bb0: %tmp = icmp ne i32 %arg1, 0 br i1 %tmp, label %bb2, label %bb3 @@ -375,7 +375,7 @@ loop: define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 { ; GCN-LABEL: expand_requires_expand: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[2:3], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lt_i32 s0, 0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -453,8 +453,8 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) 
%out, i32 % ; GCN-NEXT: s_addc_u32 s1, s1, (.LBB8_3-.Lpost_getpc9)>>32 ; GCN-NEXT: s_setpc_b64 s[0:1] ; GCN-NEXT: .LBB8_1: ; %if -; GCN-NEXT: s_load_dword s6, s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s6, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 @@ -572,10 +572,10 @@ ret: define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i64 %arg5) #0 { ; GCN-LABEL: long_branch_hang: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s4, 0 -; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: s_cmp_lt_i32 s7, 6 @@ -607,25 +607,25 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GCN-NEXT: s_andn2_b64 vcc, exec, s[8:9] ; GCN-NEXT: s_cbranch_vccz .LBB10_5 ; GCN-NEXT: ; %bb.10: ; %Flow5 -; GCN-NEXT: s_getpc_b64 s[2:3] +; GCN-NEXT: s_getpc_b64 s[0:1] ; GCN-NEXT: .Lpost_getpc13: -; GCN-NEXT: s_add_u32 s2, s2, (.LBB10_6-.Lpost_getpc13)&4294967295 -; GCN-NEXT: s_addc_u32 s3, s3, (.LBB10_6-.Lpost_getpc13)>>32 -; GCN-NEXT: s_setpc_b64 s[2:3] +; GCN-NEXT: s_add_u32 s0, s0, (.LBB10_6-.Lpost_getpc13)&4294967295 +; GCN-NEXT: s_addc_u32 s1, s1, (.LBB10_6-.Lpost_getpc13)>>32 +; GCN-NEXT: s_setpc_b64 s[0:1] ; GCN-NEXT: .LBB10_5: ; %bb14 ; GCN-NEXT: s_cmp_lt_i32 s5, 9 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_cmp_lt_i32 s6, s7 ; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; GCN-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; 
GCN-NEXT: s_branch .LBB10_7 ; GCN-NEXT: .LBB10_6: ; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: .LBB10_7: ; %bb19 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xf -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xf +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll index e4c7df385d861..321a7ceb826f6 100644 --- a/llvm/test/CodeGen/AMDGPU/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -19,7 +19,7 @@ declare i48 @llvm.bswap.i48(i48) #1 define amdgpu_kernel void @test_bswap_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s4, s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -34,7 +34,7 @@ define amdgpu_kernel void @test_bswap_i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: test_bswap_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x10203 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -49,7 +49,7 @@ define amdgpu_kernel void @test_bswap_i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: test_bswap_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -69,7 +69,7 @@ define amdgpu_kernel void @test_bswap_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_bswap_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -87,7 +87,7 @@ define amdgpu_kernel void @test_bswap_v2i32(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: test_bswap_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x10203 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -103,7 +103,7 @@ define amdgpu_kernel void @test_bswap_v2i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: test_bswap_v2i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -124,7 +124,7 @@ define amdgpu_kernel void @test_bswap_v2i32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -148,7 +148,7 @@ define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: test_bswap_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x10203 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -166,7 +166,7 @@ define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: test_bswap_v4i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 
s3, 0x31016000 @@ -189,7 +189,7 @@ define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_v8i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -226,7 +226,7 @@ define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: test_bswap_v8i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v4, 0x10203 ; VI-NEXT: s_mov_b32 s15, 0xf000 ; VI-NEXT: s_mov_b32 s14, -1 @@ -249,7 +249,7 @@ define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: test_bswap_v8i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 @@ -278,7 +278,7 @@ define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -296,7 +296,7 @@ define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: test_bswap_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x10203 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -312,7 +312,7 @@ define 
amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: test_bswap_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -333,7 +333,7 @@ define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -357,7 +357,7 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: test_bswap_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x10203 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -375,7 +375,7 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: test_bswap_v2i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -398,7 +398,7 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -435,7 +435,7 @@ define amdgpu_kernel void 
@test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: test_bswap_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v4, 0x10203 ; VI-NEXT: s_mov_b32 s15, 0xf000 ; VI-NEXT: s_mov_b32 s14, -1 @@ -458,7 +458,7 @@ define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: test_bswap_v4i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index d50ba64ba5d47..0cdd6b919f1c8 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -13,35 +13,35 @@ ; float ; -------------------------------------------------------------------- -define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: v_mov_b32_e32 v1, s6 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: 
global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: v_mov_b32_e32 v1, s6 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -49,15 +49,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s8 -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_add_i32 s4, s18, 0x400 +; GFX10-NEXT: 
buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -66,36 +70,44 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s8 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen offset:1024 glc +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v1, s18 +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX908-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s10, s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s10 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -103,26 +115,30 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: 
v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -130,26 +146,30 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB0_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s10, s8, 0x400 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; 
GFX7-NEXT: v_mov_b32_e32 v3, s10 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -157,26 +177,30 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB0_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s10, s8, 0x400 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s10 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; 
GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -185,51 +209,51 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, v4 ; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB0_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst ret float %result } -define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 
0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: v_mov_b32_e32 v1, s6 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: v_mov_b32_e32 v1, s6 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -237,14 +261,18 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 -; GFX10-NEXT: 
v_mov_b32_e32 v3, s8 -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v1, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_add_i32 s4, s18, 0x400 +; GFX10-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -252,97 +280,117 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB1_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s8 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: 
s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v1, s18 +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s8 -; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, s18 +; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s18 +; GFX8-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v1, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v2 ; GFX8-NEXT: 
v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s8 -; GFX7-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s10, s8, 0x400 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s10 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s18 +; GFX7-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v1, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; 
GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB1_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s10, s8, 0x400 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s10 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s18 +; GFX6-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -350,25 +398,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v2, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB1_1 ; 
GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + %unused = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst ret void } -define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(7) %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall(ptr addrspace(7) %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -401,7 +449,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[2:3], exec @@ -429,7 +477,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall: ; GFX11: ; 
%bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s1, exec_lo @@ -459,7 +507,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 @@ -521,7 +569,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[6:7], exec @@ -547,7 +595,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 @@ -605,7 +653,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: v_mov_b32_e32 v0, v6 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall: ; GFX8: ; 
%bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 @@ -663,7 +711,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX8-NEXT: v_mov_b32_e32 v0, v6 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 @@ -720,7 +768,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX7-NEXT: v_mov_b32_e32 v0, v6 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 @@ -778,8026 +826,3307 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst ret float %result } -define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: +; -------------------------------------------------------------------- +; double +; -------------------------------------------------------------------- + 
+define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB3_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: s_add_i32 s4, s6, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB3_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s8 -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX10-NEXT: s_add_i32 s4, s18, 0x800 +; GFX10-NEXT: v_mov_b32_e32 v6, s4 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v10, v1 +; GFX10-NEXT: v_mov_b32_e32 v9, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, v7 +; GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GFX10-NEXT: v_mov_b32_e32 v2, v9 +; GFX10-NEXT: v_mov_b32_e32 v3, v10 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; 
GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s8 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen offset:1024 glc +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s10, s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s10 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: s_mov_b64 
s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: 
v_mov_b32_e32 v5, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s10, s8, 0x400 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s10 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 
s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX7-NEXT: s_add_i32 s6, s18, 0x800 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v10, v1 +; GFX7-NEXT: v_mov_b32_e32 v9, v0 +; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v7 +; GFX7-NEXT: v_mov_b32_e32 v1, v8 +; GFX7-NEXT: v_mov_b32_e32 v2, v9 +; GFX7-NEXT: v_mov_b32_e32 v3, v10 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB3_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s10, s8, 0x400 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s10 +; GFX6-NEXT: s_mov_b32 s11, 
s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX6-NEXT: s_add_i32 s6, s18, 0x800 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v10, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v0 +; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v7 +; GFX6-NEXT: v_mov_b32_e32 v1, v8 +; GFX6-NEXT: v_mov_b32_e32 v2, v9 +; GFX6-NEXT: v_mov_b32_e32 v3, v10 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB3_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret float %result + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") 
seq_cst + ret double %result } -define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: +define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 +; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e32 v[2:3], v[4:5], v[0:1] +; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB4_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: s_add_i32 s4, s6, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX11-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB4_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s8 -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v2, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_add_i32 s4, s18, 0x800 +; GFX10-NEXT: buffer_load_dwordx2 v[4:5], v2, s[8:11], 0 offen offset:2048 +; GFX10-NEXT: v_mov_b32_e32 v6, s4 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v10, v5 +; GFX10-NEXT: v_mov_b32_e32 v9, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v8, v3 +; GFX10-NEXT: v_mov_b32_e32 v7, v2 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] +; 
GFX10-NEXT: v_mov_b32_e32 v4, v7 +; GFX10-NEXT: v_mov_b32_e32 v5, v8 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB4_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s8 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s8 -; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v2, s18 +; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 
v10, v5 +; GFX908-NEXT: v_mov_b32_e32 v9, v4 +; GFX908-NEXT: v_mov_b32_e32 v8, v3 +; GFX908-NEXT: v_mov_b32_e32 v7, v2 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v7 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v8 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB4_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX8-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v10, v5 +; GFX8-NEXT: v_mov_b32_e32 v9, v4 +; GFX8-NEXT: v_mov_b32_e32 v8, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, v2 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc ; 
GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v7 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v8 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s8 -; GFX7-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s10, s8, 0x400 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s10 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v2, s[8:11], 0 offen offset:2048 +; GFX7-NEXT: s_add_i32 s6, s18, 0x800 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX7-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v10, v5 +; GFX7-NEXT: v_mov_b32_e32 v9, v4 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc ; GFX7-NEXT: 
s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v2, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v5, v8 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB4_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s10, s8, 0x400 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s10 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v2, s[8:11], 0 offen offset:2048 +; GFX6-NEXT: s_add_i32 s6, s18, 0x800 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX6-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v10, v5 +; GFX6-NEXT: v_mov_b32_e32 v9, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 
offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v2, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v5, v8 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB4_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %unused = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst ret void } -define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: +define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall(ptr addrspace(7) %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: s_addk_co_i32 s4, 0x400 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v3, s4 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 -; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_dual_mov_b32 v8, 
v3 :: v_dual_mov_b32 v7, v2 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_readfirstlane_b32 s4, v9 +; GFX12-NEXT: v_readfirstlane_b32 s5, v10 +; GFX12-NEXT: v_readfirstlane_b32 s6, v7 +; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 +; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB5_3: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Loop Header: Depth=1 +; GFX12-NEXT: ; Child Loop BB5_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: v_add_f64_e32 v[11:12], v[13:14], v[5:6] +; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 +; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 +; GFX12-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 +; GFX12-NEXT: ; => 
This Inner Loop Header: Depth=2 +; GFX12-NEXT: v_readfirstlane_b32 s4, v9 +; GFX12-NEXT: v_readfirstlane_b32 s5, v10 +; GFX12-NEXT: v_readfirstlane_b32 s6, v7 +; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB5_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 +; GFX12-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] +; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB5_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB5_3 +; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: v_mov_b32_e32 v7, v6 +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: s_mov_b64 s[2:3], exec ; 
GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 +; GFX940-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_readfirstlane_b32 s4, v0 +; GFX940-NEXT: v_readfirstlane_b32 s5, v1 +; GFX940-NEXT: v_readfirstlane_b32 s6, v2 +; GFX940-NEXT: v_readfirstlane_b32 s7, v3 +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_atomic_add_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX940-NEXT: ; implicit-def: $vgpr4 +; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB5_1 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v10 +; GFX11-NEXT: v_readfirstlane_b32 s6, v7 +; GFX11-NEXT: v_readfirstlane_b32 s7, v8 +; 
GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v3, s4 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB5_1 +; GFX11-NEXT: ; %bb.2: +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB5_3: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-NEXT: ; Child Loop BB5_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] +; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 +; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 +; GFX11-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 +; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v10 +; GFX11-NEXT: v_readfirstlane_b32 s6, v7 +; GFX11-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 
s[4:5], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB5_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 +; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] +; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB5_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB5_3 +; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s8 -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v8, v3 +; GFX10-NEXT: v_mov_b32_e32 v7, v2 +; GFX10-NEXT: v_mov_b32_e32 v10, v1 +; GFX10-NEXT: v_mov_b32_e32 v9, v0 +; 
GFX10-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_mov_b32 s6, exec_lo +; GFX10-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_readfirstlane_b32 s8, v9 +; GFX10-NEXT: v_readfirstlane_b32 s9, v10 +; GFX10-NEXT: v_readfirstlane_b32 s10, v7 +; GFX10-NEXT: v_readfirstlane_b32 s11, v8 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[9:10] +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[7:8] +; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; GFX10-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX10-NEXT: ; implicit-def: $vgpr4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB5_1 +; GFX10-NEXT: ; %bb.2: +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: .LBB5_3: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Loop Header: Depth=1 +; GFX10-NEXT: ; Child Loop BB5_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] +; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v0, v11 +; GFX10-NEXT: v_mov_b32_e32 v1, v12 +; GFX10-NEXT: v_mov_b32_e32 v2, v13 +; GFX10-NEXT: v_mov_b32_e32 v3, v14 +; GFX10-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 +; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX10-NEXT: v_readfirstlane_b32 s8, v9 +; GFX10-NEXT: v_readfirstlane_b32 s9, v10 +; GFX10-NEXT: v_readfirstlane_b32 s10, v7 +; GFX10-NEXT: v_readfirstlane_b32 s11, v8 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[9:10] +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[7:8] +; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; 
GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB5_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] +; GFX10-NEXT: v_mov_b32_e32 v14, v1 +; GFX10-NEXT: v_mov_b32_e32 v13, v0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_cbranch_execnz .LBB5_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB5_3 +; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s10 -; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: s_mov_b64 s[6:7], exec +; GFX90A-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 +; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 +; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 +; GFX90A-NEXT: 
v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_add_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4 +; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s10, s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s10 -; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: 
buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: v_mov_b32_e32 v8, v3 +; GFX908-NEXT: v_mov_b32_e32 v7, v2 +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4 +; GFX908-NEXT: s_mov_b64 s[6:7], exec +; GFX908-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_readfirstlane_b32 s8, v9 +; GFX908-NEXT: v_readfirstlane_b32 s9, v10 +; GFX908-NEXT: v_readfirstlane_b32 s10, v7 +; GFX908-NEXT: v_readfirstlane_b32 s11, v8 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: ; implicit-def: $vgpr4 +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: ; %bb.2: +; GFX908-NEXT: s_mov_b64 exec, s[6:7] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB5_3: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Loop Header: Depth=1 +; GFX908-NEXT: ; Child Loop BB5_4 Depth 2 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] +; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v0, v11 +; GFX908-NEXT: v_mov_b32_e32 v1, v12 +; GFX908-NEXT: v_mov_b32_e32 v2, v13 +; GFX908-NEXT: v_mov_b32_e32 v3, v14 +; GFX908-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 +; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX908-NEXT: v_readfirstlane_b32 s8, v9 +; GFX908-NEXT: v_readfirstlane_b32 s9, v10 +; 
GFX908-NEXT: v_readfirstlane_b32 s10, v7 +; GFX908-NEXT: v_readfirstlane_b32 s11, v8 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB5_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 +; GFX908-NEXT: s_mov_b64 exec, s[12:13] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] +; GFX908-NEXT: v_mov_b32_e32 v14, v1 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v13, v0 +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB5_3 +; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 -; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v8, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, v2 +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_readfirstlane_b32 s8, v9 +; GFX8-NEXT: v_readfirstlane_b32 s9, v10 +; GFX8-NEXT: v_readfirstlane_b32 s10, 
v7 +; GFX8-NEXT: v_readfirstlane_b32 s11, v8 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: ; implicit-def: $vgpr4 +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: ; %bb.2: +; GFX8-NEXT: s_mov_b64 exec, s[6:7] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB5_3: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Loop Header: Depth=1 +; GFX8-NEXT: ; Child Loop BB5_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX8-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] +; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v0, v11 +; GFX8-NEXT: v_mov_b32_e32 v1, v12 +; GFX8-NEXT: v_mov_b32_e32 v2, v13 +; GFX8-NEXT: v_mov_b32_e32 v3, v14 +; GFX8-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 +; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX8-NEXT: v_readfirstlane_b32 s8, v9 +; GFX8-NEXT: v_readfirstlane_b32 s9, v10 +; GFX8-NEXT: v_readfirstlane_b32 s10, v7 +; GFX8-NEXT: v_readfirstlane_b32 s11, v8 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB5_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 +; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] 
+; GFX8-NEXT: v_mov_b32_e32 v14, v1 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v13, v0 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB5_3 +; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s10, s8, 0x400 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s10 -; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v10, v1 +; GFX7-NEXT: v_mov_b32_e32 v9, v0 +; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x800, v4 +; GFX7-NEXT: s_mov_b64 s[6:7], exec +; GFX7-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_readfirstlane_b32 s8, v9 +; GFX7-NEXT: v_readfirstlane_b32 s9, v10 +; GFX7-NEXT: v_readfirstlane_b32 s10, v7 +; GFX7-NEXT: v_readfirstlane_b32 s11, v8 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX7-NEXT: ; implicit-def: $vgpr4 +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB5_1 +; 
GFX7-NEXT: ; %bb.2: +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB5_3: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Loop Header: Depth=1 +; GFX7-NEXT: ; Child Loop BB5_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX7-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] +; GFX7-NEXT: s_mov_b64 s[12:13], exec +; GFX7-NEXT: v_mov_b32_e32 v0, v11 +; GFX7-NEXT: v_mov_b32_e32 v1, v12 +; GFX7-NEXT: v_mov_b32_e32 v2, v13 +; GFX7-NEXT: v_mov_b32_e32 v3, v14 +; GFX7-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 +; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX7-NEXT: v_readfirstlane_b32 s8, v9 +; GFX7-NEXT: v_readfirstlane_b32 s9, v10 +; GFX7-NEXT: v_readfirstlane_b32 s10, v7 +; GFX7-NEXT: v_readfirstlane_b32 s11, v8 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB5_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 +; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] +; GFX7-NEXT: v_mov_b32_e32 v14, v1 +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v13, v0 ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB5_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: 
s_cbranch_execnz .LBB5_3 +; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s10, s8, 0x400 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s10 -; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v10, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v0 +; GFX6-NEXT: v_add_i32_e32 v15, vcc, 0x800, v4 +; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_readfirstlane_b32 s8, v9 +; GFX6-NEXT: v_readfirstlane_b32 s9, v10 +; GFX6-NEXT: v_readfirstlane_b32 s10, v7 +; GFX6-NEXT: v_readfirstlane_b32 s11, v8 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX6-NEXT: ; implicit-def: $vgpr4 +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB5_1 +; GFX6-NEXT: ; %bb.2: +; GFX6-NEXT: s_mov_b64 exec, s[6:7] +; GFX6-NEXT: s_mov_b64 s[6:7], 0 +; GFX6-NEXT: .LBB5_3: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Loop Header: Depth=1 +; GFX6-NEXT: ; Child Loop BB5_4 Depth 2 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] +; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: s_waitcnt expcnt(0) -; 
GFX6-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v11 +; GFX6-NEXT: v_mov_b32_e32 v1, v12 +; GFX6-NEXT: v_mov_b32_e32 v2, v13 +; GFX6-NEXT: v_mov_b32_e32 v3, v14 +; GFX6-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 +; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX6-NEXT: v_readfirstlane_b32 s8, v9 +; GFX6-NEXT: v_readfirstlane_b32 s9, v10 +; GFX6-NEXT: v_readfirstlane_b32 s10, v7 +; GFX6-NEXT: v_readfirstlane_b32 s11, v8 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB5_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 +; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] +; GFX6-NEXT: v_mov_b32_e32 v14, v1 +; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v13, v0 ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB5_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX6-NEXT: s_cbranch_execnz .LBB5_3 +; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst - ret float %result + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %result = 
atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst + ret double %result } -define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: +; -------------------------------------------------------------------- +; half +; -------------------------------------------------------------------- + +define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: v_mov_b32_e32 v5, s4 +; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, 
v1 +; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_addk_i32 s6, 0x200 +; GFX940-NEXT: s_and_b32 s4, s6, -4 ; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 +; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX940-NEXT: v_add_f16_e32 v2, v2, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX940-NEXT: s_cbranch_execnz .LBB6_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s4, s6, -4 +; GFX11-NEXT: v_mov_b32_e32 v5, s4 +; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: 
buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s8 -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: s_addk_i32 s18, 0x200 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_and_b32 s4, s18, -4 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: buffer_load_dword v2, v5, s[8:11], 0 offen +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 
v1, v2, s6, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB6_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s8 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen offset:1024 glc +; GFX90A-NEXT: s_addk_i32 s18, 0x200 +; GFX90A-NEXT: s_and_b32 s4, s18, -4 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[8:11], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s18, 3 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], 
v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s10, s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s10 +; GFX908-NEXT: s_addk_i32 s18, 0x200 +; GFX908-NEXT: s_and_b32 s4, s18, -4 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v5, s4 +; GFX908-NEXT: buffer_load_dword v2, v5, s[8:11], 0 offen +; GFX908-NEXT: s_and_b32 s4, s18, 3 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v2 +; GFX908-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 
s6, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: s_addk_i32 s18, 0x200 +; GFX8-NEXT: s_and_b32 s4, s18, -4 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: buffer_load_dword v2, v5, s[8:11], 0 offen +; GFX8-NEXT: s_and_b32 s4, s18, 3 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; 
GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v2 +; GFX8-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX8-NEXT: v_and_b32_e32 v3, s7, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s10, s8, 0x400 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s10 +; GFX7-NEXT: s_addk_i32 s18, 0x200 +; GFX7-NEXT: s_and_b32 s4, s18, -4 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_and_b32 s4, s18, 3 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; 
GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB6_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s10, s8, 0x400 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s10 +; GFX6-NEXT: s_addk_i32 s18, 0x200 +; GFX6-NEXT: s_and_b32 s4, s18, -4 +; 
GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_and_b32 s4, s18, 3 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB6_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = 
atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 - ret float %result + %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst + ret half %result } -define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: v_mov_b32_e32 v3, s4 +; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_cbranch_execnz .LBB7_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_addk_i32 s6, 0x200 +; GFX940-NEXT: s_and_b32 s4, s6, -4 ; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 +; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX940-NEXT: v_add_f16_e32 v2, v2, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; 
GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX940-NEXT: s_cbranch_execnz .LBB7_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s4, s6, -4 +; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc ; 
GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s8 -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: s_addk_i32 s18, 0x200 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_and_b32 s4, s18, -4 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: buffer_load_dword v2, v3, s[8:11], 0 offen +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB7_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s8 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen offset:1024 glc +; GFX90A-NEXT: s_addk_i32 s18, 0x200 +; GFX90A-NEXT: s_and_b32 s4, s18, -4 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[8:11], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s18, 3 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX90A-NEXT: 
v_and_or_b32 v2, v3, s7, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s10, s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s10 +; GFX908-NEXT: s_addk_i32 s18, 0x200 +; GFX908-NEXT: s_and_b32 s4, s18, -4 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, s4 +; GFX908-NEXT: buffer_load_dword v2, v3, s[8:11], 0 offen +; GFX908-NEXT: s_and_b32 s4, s18, 3 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v2 +; GFX908-NEXT: 
v_add_f16_e32 v1, v1, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v2, v4 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: s_addk_i32 s18, 0x200 +; GFX8-NEXT: s_and_b32 s4, s18, -4 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: buffer_load_dword v2, v3, s[8:11], 0 offen +; GFX8-NEXT: s_and_b32 s4, s18, 3 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f32_e32 v4, 
v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v2 +; GFX8-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX8-NEXT: v_and_b32_e32 v4, s7, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s10, s8, 0x400 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s10 +; GFX7-NEXT: s_addk_i32 s18, 0x200 +; GFX7-NEXT: s_and_b32 s4, s18, -4 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_and_b32 s4, s18, 3 +; GFX7-NEXT: s_lshl_b32 
s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s10, s8, 0x400 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s10 +; GFX6-NEXT: s_addk_i32 s18, 0x200 
+; GFX6-NEXT: s_and_b32 s4, s18, -4 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_and_b32 s4, s18, 3 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB7_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, float 
%val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 - ret float %result + %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 + %unused = atomicrmw fadd ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst + ret void } -; -------------------------------------------------------------------- -; double -; -------------------------------------------------------------------- - -define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: +define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr addrspace(7) %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: s_addk_co_i32 s4, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 -; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX12-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX12-NEXT: v_not_b32_e32 v11, v7 +; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; 
GFX12-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Loop Header: Depth=1 +; GFX12-NEXT: ; Child Loop BB8_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_add_f16_e32 v6, v6, v5 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX12-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX12-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-NEXT: 
v_readfirstlane_b32 s5, v1 +; GFX12-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB8_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX12-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB8_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB8_3 +; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 +; GFX940-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 
v4, 3, v4 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v6, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v11, v6 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_readfirstlane_b32 s4, v0 +; GFX940-NEXT: v_readfirstlane_b32 s5, v1 +; GFX940-NEXT: v_readfirstlane_b32 s6, v2 +; GFX940-NEXT: v_readfirstlane_b32 s7, v3 +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: buffer_load_dword v7, v10, s[4:7], 0 offen +; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB8_1 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: s_mov_b64 exec, s[2:3] +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Loop Header: Depth=1 +; GFX940-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX940-NEXT: v_add_f16_e32 v6, v6, v5 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX940-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX940-NEXT: s_mov_b64 s[8:9], exec +; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[6:7] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX940-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX940-NEXT: v_readfirstlane_b32 s4, v0 +; GFX940-NEXT: v_readfirstlane_b32 s5, v1 +; GFX940-NEXT: v_readfirstlane_b32 s6, v2 +; GFX940-NEXT: v_readfirstlane_b32 s7, v3 +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_atomic_cmpswap 
v[8:9], v10, s[4:7], 0 offen sc0 +; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB8_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v7, v8 ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB8_3 +; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX11-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX11-NEXT: v_not_b32_e32 v11, v7 +; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-NEXT: v_readfirstlane_b32 
s6, v2 +; GFX11-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: ; %bb.2: +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-NEXT: ; Child Loop BB8_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: v_add_f16_e32 v6, v6, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX11-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX11-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB8_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX11-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB8_3 +; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: s_addk_i32 s8, 0x800 -; GFX10-NEXT: v_mov_b32_e32 v6, s8 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start -; 
GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v10, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_mov_b32 s6, exec_lo +; GFX10-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX10-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v11, v7 +; GFX10-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_readfirstlane_b32 s8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s9, v1 +; GFX10-NEXT: v_readfirstlane_b32 s10, v2 +; GFX10-NEXT: v_readfirstlane_b32 s11, v3 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; GFX10-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: ; %bb.2: +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Loop Header: Depth=1 +; GFX10-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v7 -; GFX10-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX10-NEXT: v_add_f16_e32 v6, v6, v5 +; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX10-NEXT: v_mov_b32_e32 v9, v7 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 +; GFX10-NEXT: .LBB8_4: ; Parent Loop 
BB8_3 Depth=1 +; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX10-NEXT: v_readfirstlane_b32 s8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s9, v1 +; GFX10-NEXT: v_readfirstlane_b32 s10, v2 +; GFX10-NEXT: v_readfirstlane_b32 s11, v3 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB8_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v8 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB8_3 +; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 +; GFX90A-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX90A-NEXT: 
v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v6, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v11, v6 +; GFX90A-NEXT: s_mov_b64 s[6:7], exec +; GFX90A-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 +; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 +; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: s_mov_b64 exec, s[6:7] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Loop Header: Depth=1 +; GFX90A-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX90A-NEXT: v_add_f16_e32 v6, v6, v5 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX90A-NEXT: s_mov_b64 s[12:13], exec +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 +; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 +; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX90A-NEXT: 
s_xor_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB8_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v7, v8 ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB8_3 +; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 -; GFX908-NEXT: s_add_i32 s10, s8, 0x800 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s10 -; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 +; GFX908-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v6, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v11, v6 +; GFX908-NEXT: s_mov_b64 s[6:7], exec +; GFX908-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_readfirstlane_b32 s8, v0 +; GFX908-NEXT: v_readfirstlane_b32 s9, v1 +; GFX908-NEXT: v_readfirstlane_b32 s10, v2 +; GFX908-NEXT: v_readfirstlane_b32 s11, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; 
GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: ; %bb.2: +; GFX908-NEXT: s_mov_b64 exec, s[6:7] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Loop Header: Depth=1 +; GFX908-NEXT: ; Child Loop BB8_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX908-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX908-NEXT: v_add_f16_e32 v6, v6, v5 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX908-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX908-NEXT: v_mov_b32_e32 v9, v7 +; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v8, v6 +; GFX908-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX908-NEXT: v_readfirstlane_b32 s8, v0 +; GFX908-NEXT: v_readfirstlane_b32 s9, v1 +; GFX908-NEXT: v_readfirstlane_b32 s10, v2 +; GFX908-NEXT: v_readfirstlane_b32 s11, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB8_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX908-NEXT: 
s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB8_3 +; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 -; GFX8-NEXT: s_add_i32 s10, s8, 0x800 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s10 -; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 +; GFX8-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v6, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v11, v6 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_readfirstlane_b32 s8, v0 +; GFX8-NEXT: v_readfirstlane_b32 s9, v1 +; GFX8-NEXT: v_readfirstlane_b32 s10, v2 +; GFX8-NEXT: v_readfirstlane_b32 s11, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; 
GFX8-NEXT: s_nop 0 +; GFX8-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: ; %bb.2: +; GFX8-NEXT: s_mov_b64 exec, s[6:7] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Loop Header: Depth=1 +; GFX8-NEXT: ; Child Loop BB8_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX8-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX8-NEXT: v_add_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX8-NEXT: v_and_b32_e32 v8, v7, v11 +; GFX8-NEXT: v_or_b32_e32 v6, v8, v6 +; GFX8-NEXT: v_mov_b32_e32 v9, v7 +; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v8, v6 +; GFX8-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX8-NEXT: v_readfirstlane_b32 s8, v0 +; GFX8-NEXT: v_readfirstlane_b32 s9, v1 +; GFX8-NEXT: v_readfirstlane_b32 s10, v2 +; GFX8-NEXT: v_readfirstlane_b32 s11, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB8_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: 
v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB8_3 +; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 -; GFX7-NEXT: s_add_i32 s10, s8, 0x800 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s10 -; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v10, v1 -; GFX7-NEXT: v_mov_b32_e32 v9, v0 -; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v8 -; GFX7-NEXT: v_mov_b32_e32 v2, v9 -; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 -; GFX6-NEXT: s_add_i32 s10, s8, 0x800 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mov_b32_e32 v6, s10 -; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, v1 -; GFX6-NEXT: v_mov_b32_e32 v9, v0 -; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v7 -; GFX6-NEXT: v_mov_b32_e32 v1, v8 -; GFX6-NEXT: v_mov_b32_e32 v2, v9 -; GFX6-NEXT: v_mov_b32_e32 v3, v10 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB8_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret double %result -} - -define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: s_addk_co_i32 s4, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; 
GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 -; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_add_f64_e32 v[2:3], v[4:5], v[0:1] -; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB9_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX11-NEXT: ; 
=>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX11-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x800 -; GFX10-NEXT: v_mov_b32_e32 v6, s8 -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: buffer_load_dwordx2 v[4:5], v2, s[4:7], 0 offen offset:2048 -; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v10, v5 -; GFX10-NEXT: v_mov_b32_e32 v9, v4 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v8, v3 -; GFX10-NEXT: v_mov_b32_e32 v7, v2 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v4, v7 -; GFX10-NEXT: v_mov_b32_e32 v5, v8 -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_cbranch_execnz .LBB9_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s8 -; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[4:7], 0 offen offset:2048 -; GFX908-NEXT: s_add_i32 s10, s8, 0x800 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s10 -; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX908-NEXT: v_mov_b32_e32 v10, v5 -; GFX908-NEXT: v_mov_b32_e32 v9, v4 -; GFX908-NEXT: v_mov_b32_e32 v8, v3 -; GFX908-NEXT: v_mov_b32_e32 v7, v2 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v7 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: v_mov_b32_e32 v5, v8 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB9_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s8 -; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[4:7], 0 offen offset:2048 -; GFX8-NEXT: s_add_i32 s10, s8, 0x800 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s10 -; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v10, v5 -; GFX8-NEXT: v_mov_b32_e32 v9, v4 -; GFX8-NEXT: v_mov_b32_e32 v8, v3 -; GFX8-NEXT: v_mov_b32_e32 v7, v2 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v7 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v5, v8 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB9_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v2, s[4:7], 0 offen offset:2048 -; GFX7-NEXT: s_add_i32 s10, s8, 0x800 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s10 -; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v10, v5 -; GFX7-NEXT: v_mov_b32_e32 v9, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v4, v7 -; 
GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v5, v8 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB9_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v2, s[4:7], 0 offen offset:2048 -; GFX6-NEXT: s_add_i32 s10, s8, 0x800 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mov_b32_e32 v6, s10 -; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, v5 -; GFX6-NEXT: v_mov_b32_e32 v9, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v4, v7 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v5, v8 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB9_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void -} - -define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, double %val) #0 { -; GFX12-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_readfirstlane_b32 s4, v9 -; GFX12-NEXT: v_readfirstlane_b32 s5, v10 -; GFX12-NEXT: v_readfirstlane_b32 s6, v7 -; GFX12-NEXT: v_readfirstlane_b32 s7, v8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 -; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB10_3: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB10_4 Depth 2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_add_f64_e32 v[11:12], v[13:14], v[5:6] -; GFX12-NEXT: s_mov_b32 s2, exec_lo -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 -; GFX12-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 -; 
GFX12-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v9 -; GFX12-NEXT: v_readfirstlane_b32 s5, v10 -; GFX12-NEXT: v_readfirstlane_b32 s6, v7 -; GFX12-NEXT: v_readfirstlane_b32 s7, v8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB10_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] -; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB10_3 -; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v6 -; GFX940-NEXT: v_mov_b32_e32 v6, v5 -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], 
v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_readfirstlane_b32 s4, v9 -; GFX11-NEXT: v_readfirstlane_b32 s5, v10 -; GFX11-NEXT: v_readfirstlane_b32 s6, v7 -; GFX11-NEXT: v_readfirstlane_b32 s7, v8 -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048 -; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 -; 
GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB10_3: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB10_4 Depth 2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 -; GFX11-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 -; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v9 -; GFX11-NEXT: v_readfirstlane_b32 s5, v10 -; GFX11-NEXT: v_readfirstlane_b32 s6, v7 -; GFX11-NEXT: v_readfirstlane_b32 s7, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB10_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] -; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB10_3 -; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] 
-; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v8, v3 -; GFX10-NEXT: v_mov_b32_e32 v7, v2 -; GFX10-NEXT: v_mov_b32_e32 v10, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_readfirstlane_b32 s8, v9 -; GFX10-NEXT: v_readfirstlane_b32 s9, v10 -; GFX10-NEXT: v_readfirstlane_b32 s10, v7 -; GFX10-NEXT: v_readfirstlane_b32 s11, v8 -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[9:10] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[7:8] -; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 -; GFX10-NEXT: ; implicit-def: $vgpr4 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB10_1 -; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: .LBB10_3: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB10_4 Depth 2 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] -; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, v11 -; GFX10-NEXT: v_mov_b32_e32 v1, v12 -; GFX10-NEXT: v_mov_b32_e32 v2, v13 -; GFX10-NEXT: v_mov_b32_e32 v3, v14 -; GFX10-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 -; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX10-NEXT: v_readfirstlane_b32 s8, v9 -; GFX10-NEXT: v_readfirstlane_b32 s9, v10 -; GFX10-NEXT: v_readfirstlane_b32 s10, v7 -; GFX10-NEXT: v_readfirstlane_b32 s11, v8 -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[9:10] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[7:8] 
-; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB10_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] -; GFX10-NEXT: v_mov_b32_e32 v14, v1 -; GFX10-NEXT: v_mov_b32_e32 v13, v0 -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB10_3 -; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 -; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 -; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_add_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 -; GFX90A-NEXT: ; %bb.2: -; 
GFX90A-NEXT: s_mov_b64 exec, s[6:7] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v8, v3 -; GFX908-NEXT: v_mov_b32_e32 v7, v2 -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4 -; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_readfirstlane_b32 s8, v9 -; GFX908-NEXT: v_readfirstlane_b32 s9, v10 -; GFX908-NEXT: v_readfirstlane_b32 s10, v7 -; GFX908-NEXT: v_readfirstlane_b32 s11, v8 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: ; implicit-def: $vgpr4 -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_1 -; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] -; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB10_3: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB10_4 Depth 2 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] -; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v0, v11 -; GFX908-NEXT: v_mov_b32_e32 v1, v12 -; GFX908-NEXT: v_mov_b32_e32 v2, v13 -; GFX908-NEXT: v_mov_b32_e32 v3, v14 -; GFX908-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 -; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: v_readfirstlane_b32 s8, v9 -; GFX908-NEXT: 
v_readfirstlane_b32 s9, v10 -; GFX908-NEXT: v_readfirstlane_b32 s10, v7 -; GFX908-NEXT: v_readfirstlane_b32 s11, v8 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] -; GFX908-NEXT: v_mov_b32_e32 v14, v1 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v13, v0 -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB10_3 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, v3 -; GFX8-NEXT: v_mov_b32_e32 v7, v2 -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_readfirstlane_b32 s8, v9 -; GFX8-NEXT: v_readfirstlane_b32 s9, v10 -; GFX8-NEXT: v_readfirstlane_b32 s10, v7 -; GFX8-NEXT: v_readfirstlane_b32 s11, v8 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen 
offset:2048 -; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB10_1 -; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB10_3: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB10_4 Depth 2 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] -; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v0, v11 -; GFX8-NEXT: v_mov_b32_e32 v1, v12 -; GFX8-NEXT: v_mov_b32_e32 v2, v13 -; GFX8-NEXT: v_mov_b32_e32 v3, v14 -; GFX8-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 -; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX8-NEXT: v_readfirstlane_b32 s8, v9 -; GFX8-NEXT: v_readfirstlane_b32 s9, v10 -; GFX8-NEXT: v_readfirstlane_b32 s10, v7 -; GFX8-NEXT: v_readfirstlane_b32 s11, v8 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB10_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] -; GFX8-NEXT: v_mov_b32_e32 v14, v1 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v13, v0 -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB10_3 -; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v10, v1 -; GFX7-NEXT: v_mov_b32_e32 v9, v0 -; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x800, v4 -; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_readfirstlane_b32 s8, v9 -; GFX7-NEXT: v_readfirstlane_b32 s9, v10 -; GFX7-NEXT: v_readfirstlane_b32 s10, v7 -; GFX7-NEXT: v_readfirstlane_b32 s11, v8 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 -; GFX7-NEXT: ; implicit-def: $vgpr4 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB10_1 -; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: .LBB10_3: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB10_4 Depth 2 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] -; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_mov_b32_e32 v0, v11 -; GFX7-NEXT: v_mov_b32_e32 v1, v12 -; GFX7-NEXT: v_mov_b32_e32 v2, v13 -; GFX7-NEXT: v_mov_b32_e32 v3, v14 -; GFX7-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 -; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX7-NEXT: v_readfirstlane_b32 s8, v9 -; GFX7-NEXT: v_readfirstlane_b32 s9, v10 -; GFX7-NEXT: v_readfirstlane_b32 s10, v7 -; GFX7-NEXT: v_readfirstlane_b32 s11, v8 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB10_4 -; 
GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 -; GFX7-NEXT: s_mov_b64 exec, s[12:13] -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] -; GFX7-NEXT: v_mov_b32_e32 v14, v1 -; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v13, v0 -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB10_3 -; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v10, v1 -; GFX6-NEXT: v_mov_b32_e32 v9, v0 -; GFX6-NEXT: v_add_i32_e32 v15, vcc, 0x800, v4 -; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_readfirstlane_b32 s8, v9 -; GFX6-NEXT: v_readfirstlane_b32 s9, v10 -; GFX6-NEXT: v_readfirstlane_b32 s10, v7 -; GFX6-NEXT: v_readfirstlane_b32 s11, v8 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 -; GFX6-NEXT: ; implicit-def: $vgpr4 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB10_1 -; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: .LBB10_3: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB10_4 Depth 2 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] -; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v11 -; GFX6-NEXT: v_mov_b32_e32 
v1, v12 -; GFX6-NEXT: v_mov_b32_e32 v2, v13 -; GFX6-NEXT: v_mov_b32_e32 v3, v14 -; GFX6-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 -; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX6-NEXT: v_readfirstlane_b32 s8, v9 -; GFX6-NEXT: v_readfirstlane_b32 s9, v10 -; GFX6-NEXT: v_readfirstlane_b32 s10, v7 -; GFX6-NEXT: v_readfirstlane_b32 s11, v8 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB10_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 -; GFX6-NEXT: s_mov_b64 exec, s[12:13] -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] -; GFX6-NEXT: v_mov_b32_e32 v14, v1 -; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v13, v0 -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB10_3 -; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret double %result -} - -define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 
v4, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: s_addk_co_i32 s4, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 -; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB11_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: 
s_addk_i32 s4, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: s_addk_i32 s8, 0x800 -; GFX10-NEXT: v_mov_b32_e32 v6, s8 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v10, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX10-NEXT: 
v_mov_b32_e32 v0, v7 -; GFX10-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_cbranch_execnz .LBB11_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 -; GFX908-NEXT: s_add_i32 s10, s8, 0x800 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s10 -; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: 
buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 -; GFX8-NEXT: s_add_i32 s10, s8, 0x800 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s10 -; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB11_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 -; GFX7-NEXT: s_add_i32 s10, s8, 0x800 -; GFX7-NEXT: 
s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s10 -; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v10, v1 -; GFX7-NEXT: v_mov_b32_e32 v9, v0 -; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v8 -; GFX7-NEXT: v_mov_b32_e32 v2, v9 -; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB11_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 -; GFX6-NEXT: s_add_i32 s10, s8, 0x800 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mov_b32_e32 v6, s10 -; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, v1 -; GFX6-NEXT: v_mov_b32_e32 v9, v0 -; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v7 -; GFX6-NEXT: v_mov_b32_e32 v1, v8 -; GFX6-NEXT: v_mov_b32_e32 v2, v9 -; GFX6-NEXT: v_mov_b32_e32 v3, v10 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 
exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB11_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 - ret double %result -} - -define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: s_addk_co_i32 s4, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 -; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB12_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: s_addk_i32 s8, 0x800 -; GFX10-NEXT: v_mov_b32_e32 v6, s8 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v10, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v7 -; GFX10-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_cbranch_execnz .LBB12_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) 
-; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 -; GFX908-NEXT: s_add_i32 s10, s8, 0x800 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s10 -; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB12_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 -; GFX8-NEXT: s_add_i32 s10, s8, 0x800 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s10 -; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: 
Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB12_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 -; GFX7-NEXT: s_add_i32 s10, s8, 0x800 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s10 -; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v10, v1 -; GFX7-NEXT: v_mov_b32_e32 v9, v0 -; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v8 -; GFX7-NEXT: v_mov_b32_e32 v2, v9 -; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB12_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; 
GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 -; GFX6-NEXT: s_add_i32 s10, s8, 0x800 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mov_b32_e32 v6, s10 -; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, v1 -; GFX6-NEXT: v_mov_b32_e32 v9, v0 -; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v7 -; GFX6-NEXT: v_mov_b32_e32 v1, v8 -; GFX6-NEXT: v_mov_b32_e32 v2, v9 -; GFX6-NEXT: v_mov_b32_e32 v3, v10 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB12_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 - ret double %result -} - -; -------------------------------------------------------------------- -; half -; -------------------------------------------------------------------- - -define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { -; GFX12-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s4, 0x200 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s5, s4, -4 -; GFX12-NEXT: s_and_b32 s4, s4, 3 -; GFX12-NEXT: v_mov_b32_e32 v5, s5 -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v3 -; GFX12-NEXT: 
s_setpc_b64 s[30:31] -; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s4, 0x200 -; GFX940-NEXT: s_and_b32 s5, s4, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s4, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v2, s6, v3 -; GFX940-NEXT: v_add_f16_e32 v2, v2, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, s6, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, s7, v2 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s4, 0x200 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s5, s4, -4 -; GFX11-NEXT: s_and_b32 s4, s4, 3 -; GFX11-NEXT: v_mov_b32_e32 v5, s5 -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; 
GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s8, 0x200 -; GFX10-NEXT: s_and_b32 s9, s8, -4 -; GFX10-NEXT: s_and_b32 s8, s8, 3 -; GFX10-NEXT: v_mov_b32_e32 v5, s9 -; GFX10-NEXT: s_lshl_b32 s8, s8, 3 -; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 -; GFX10-NEXT: s_not_b32 s10, s9 -; GFX10-NEXT: buffer_load_dword v2, v5, s[4:7], 0 offen -; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, s8, v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v1, v2, s10, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[4:7], 0 offen glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_cbranch_execnz .LBB13_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s8, 0x200 -; GFX90A-NEXT: s_and_b32 s9, s8, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s9 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[4:7], 0 offen -; GFX90A-NEXT: s_and_b32 s8, s8, 3 -; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 -; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX90A-NEXT: s_not_b32 s11, s8 -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 -; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s10, v3 -; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s10, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s11, v2 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[4:7], 0 offen glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: 
v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v4 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s8, 0x200 -; GFX908-NEXT: s_and_b32 s9, s8, -4 -; GFX908-NEXT: v_mov_b32_e32 v5, s9 -; GFX908-NEXT: buffer_load_dword v2, v5, s[4:7], 0 offen -; GFX908-NEXT: s_and_b32 s8, s8, 3 -; GFX908-NEXT: s_lshl_b32 s10, s8, 3 -; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX908-NEXT: s_not_b32 s11, s8 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v1, s10, v2 -; GFX908-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, s10, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s11, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[4:7], 0 offen glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v3 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: 
s_addk_i32 s8, 0x200 -; GFX8-NEXT: s_and_b32 s9, s8, -4 -; GFX8-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NEXT: buffer_load_dword v2, v5, s[4:7], 0 offen -; GFX8-NEXT: s_and_b32 s8, s8, 3 -; GFX8-NEXT: s_lshl_b32 s10, s8, 3 -; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX8-NEXT: s_not_b32 s11, s8 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, s10, v2 -; GFX8-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX8-NEXT: v_and_b32_e32 v3, s11, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, s10, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[4:7], 0 offen glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB13_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v3 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s8, 0x200 -; GFX7-NEXT: s_and_b32 s9, s8, -4 -; GFX7-NEXT: v_mov_b32_e32 v4, s9 -; GFX7-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_and_b32 s8, s8, 3 -; GFX7-NEXT: s_lshl_b32 s10, s8, 3 -; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX7-NEXT: s_not_b32 s11, s8 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 -; GFX7-NEXT: 
v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s11, v1 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB13_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s8, 0x200 -; GFX6-NEXT: s_and_b32 s9, s8, -4 -; GFX6-NEXT: v_mov_b32_e32 v4, s9 -; GFX6-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s8, s8, 3 -; GFX6-NEXT: s_lshl_b32 s10, s8, 3 -; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-NEXT: s_not_b32 s11, s8 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s11, v1 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc 
-; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB13_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret half %result -} - -define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s4, 0x200 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s5, s4, -4 -; GFX12-NEXT: s_and_b32 s4, s4, 3 -; GFX12-NEXT: v_mov_b32_e32 v3, s5 -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX12-NEXT: v_and_b32_e32 v1, 
0xffff, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s4, 0x200 -; GFX940-NEXT: s_and_b32 s5, s4, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s4, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v2, s6, v3 -; GFX940-NEXT: v_add_f16_e32 v2, v2, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, s6, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, s7, v2 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; 
GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s4, 0x200 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s5, s4, -4 -; GFX11-NEXT: s_and_b32 s4, s4, 3 -; GFX11-NEXT: v_mov_b32_e32 v3, s5 -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 -; 
GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s8, 0x200 -; GFX10-NEXT: s_and_b32 s9, s8, -4 -; GFX10-NEXT: s_and_b32 s8, s8, 3 -; GFX10-NEXT: v_mov_b32_e32 v3, s9 -; GFX10-NEXT: s_lshl_b32 s8, s8, 3 -; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 -; GFX10-NEXT: s_not_b32 s10, s9 -; GFX10-NEXT: buffer_load_dword v2, v3, s[4:7], 0 offen -; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, s8, v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v1, v2, s10, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s8, 0x200 -; GFX90A-NEXT: s_and_b32 s9, s8, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s9 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[4:7], 0 offen -; GFX90A-NEXT: s_and_b32 s8, s8, 3 
-; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 -; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX90A-NEXT: s_not_b32 s11, s8 -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 -; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s10, v3 -; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s10, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s11, v2 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[4:7], 0 offen glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s8, 0x200 -; GFX908-NEXT: s_and_b32 s9, s8, -4 -; GFX908-NEXT: v_mov_b32_e32 v3, s9 -; GFX908-NEXT: buffer_load_dword v2, v3, s[4:7], 0 offen -; GFX908-NEXT: s_and_b32 s8, s8, 3 -; GFX908-NEXT: s_lshl_b32 s10, s8, 3 -; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX908-NEXT: s_not_b32 s11, s8 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v1, s10, v2 -; GFX908-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, s10, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s11, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc -; 
GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s8, 0x200 -; GFX8-NEXT: s_and_b32 s9, s8, -4 -; GFX8-NEXT: v_mov_b32_e32 v3, s9 -; GFX8-NEXT: buffer_load_dword v2, v3, s[4:7], 0 offen -; GFX8-NEXT: s_and_b32 s8, s8, 3 -; GFX8-NEXT: s_lshl_b32 s10, s8, 3 -; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX8-NEXT: s_not_b32 s11, s8 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, s10, v2 -; GFX8-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX8-NEXT: v_and_b32_e32 v4, s11, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, s10, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s8, 0x200 -; GFX7-NEXT: s_and_b32 
s9, s8, -4 -; GFX7-NEXT: v_mov_b32_e32 v2, s9 -; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_and_b32 s8, s8, 3 -; GFX7-NEXT: s_lshl_b32 s10, s8, 3 -; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX7-NEXT: s_not_b32 s11, s8 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s11, v1 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s8, 0x200 -; GFX6-NEXT: s_and_b32 s9, s8, -4 -; GFX6-NEXT: v_mov_b32_e32 v2, s9 -; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s8, s8, 3 -; GFX6-NEXT: s_lshl_b32 s10, s8, 3 -; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX6-NEXT: s_not_b32 s11, s8 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt 
vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s11, v1 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB14_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fadd ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void -} - -define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX12-NEXT: v_and_b32_e32 v10, -4, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v11, v7 -; 
GFX12-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB15_3: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB15_4 Depth 2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX12-NEXT: s_mov_b32 s2, exec_lo -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v6, v6, v5 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX12-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 -; GFX12-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 -; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: 
v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX12-NEXT: v_mov_b32_e32 v7, v8 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB15_3 -; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX940-NEXT: v_and_b32_e32 v10, -4, v4 -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v6, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v11, v6 -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] 
-; GFX940-NEXT: buffer_load_dword v7, v10, s[4:7], 0 offen -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: .LBB15_3: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB15_4 Depth 2 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX940-NEXT: v_add_f16_e32 v6, v6, v5 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX940-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[6:7] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 -; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v8 -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB15_3 -; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v4, v8 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX11-NEXT: v_and_b32_e32 v10, -4, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v11, v7 -; GFX11-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 -; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB15_3: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB15_4 Depth 2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v6, v6, v5 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX11-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 -; GFX11-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 -; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX11-NEXT: v_mov_b32_e32 v7, v8 -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB15_3 -; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v4, v8 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: 
v_and_b32_e32 v4, 3, v6 -; GFX10-NEXT: v_and_b32_e32 v10, -4, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v11, v7 -; GFX10-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_readfirstlane_b32 s8, v0 -; GFX10-NEXT: v_readfirstlane_b32 s9, v1 -; GFX10-NEXT: v_readfirstlane_b32 s10, v2 -; GFX10-NEXT: v_readfirstlane_b32 s11, v3 -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] -; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 -; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: .LBB15_3: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB15_4 Depth 2 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f16_e32 v6, v6, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX10-NEXT: v_mov_b32_e32 v9, v7 -; GFX10-NEXT: v_mov_b32_e32 v8, v6 -; GFX10-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 -; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX10-NEXT: v_readfirstlane_b32 s8, v0 -; GFX10-NEXT: v_readfirstlane_b32 s9, v1 -; GFX10-NEXT: v_readfirstlane_b32 s10, v2 -; GFX10-NEXT: v_readfirstlane_b32 s11, v3 -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] -; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc -; GFX10-NEXT: 
s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB15_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX10-NEXT: v_mov_b32_e32 v7, v8 -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB15_3 -; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v4, v8 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX90A-NEXT: v_and_b32_e32 v10, -4, v4 -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v6, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v11, v6 -; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 -; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 -; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 -; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] -; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: .LBB15_3: ; %atomicrmw.start -; 
GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB15_4 Depth 2 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX90A-NEXT: v_add_f16_e32 v6, v6, v5 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 -; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 -; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 -; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX90A-NEXT: s_mov_b64 exec, s[12:13] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 -; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v8 -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_3 -; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v8 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX908-NEXT: v_and_b32_e32 v10, -4, v4 -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; 
GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v6, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v11, v6 -; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_readfirstlane_b32 s8, v0 -; GFX908-NEXT: v_readfirstlane_b32 s9, v1 -; GFX908-NEXT: v_readfirstlane_b32 s10, v2 -; GFX908-NEXT: v_readfirstlane_b32 s11, v3 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 -; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] -; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB15_3: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB15_4 Depth 2 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX908-NEXT: v_add_f16_e32 v6, v6, v5 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX908-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX908-NEXT: v_mov_b32_e32 v9, v7 -; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v8, v6 -; GFX908-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 -; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: v_readfirstlane_b32 s8, v0 -; GFX908-NEXT: v_readfirstlane_b32 s9, v1 -; GFX908-NEXT: v_readfirstlane_b32 s10, v2 -; GFX908-NEXT: v_readfirstlane_b32 s11, v3 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: 
s_cbranch_execnz .LBB15_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v8 -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB15_3 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v4, v8 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 -; GFX8-NEXT: v_and_b32_e32 v10, -4, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v6, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v11, v6 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_readfirstlane_b32 s8, v0 -; GFX8-NEXT: v_readfirstlane_b32 s9, v1 -; GFX8-NEXT: v_readfirstlane_b32 s10, v2 -; GFX8-NEXT: v_readfirstlane_b32 s11, v3 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB15_1 -; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB15_3: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB15_4 Depth 2 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX8-NEXT: v_add_f16_e32 v6, v6, 
v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX8-NEXT: v_and_b32_e32 v8, v7, v11 -; GFX8-NEXT: v_or_b32_e32 v6, v8, v6 -; GFX8-NEXT: v_mov_b32_e32 v9, v7 -; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v8, v6 -; GFX8-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 -; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX8-NEXT: v_readfirstlane_b32 s8, v0 -; GFX8-NEXT: v_readfirstlane_b32 s9, v1 -; GFX8-NEXT: v_readfirstlane_b32 s10, v2 -; GFX8-NEXT: v_readfirstlane_b32 s11, v3 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB15_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v8 -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB15_3 -; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v4, v8 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 -; GFX7-NEXT: v_not_b32_e32 v9, v4 -; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_readfirstlane_b32 s8, v0 -; GFX7-NEXT: 
v_readfirstlane_b32 s9, v1 -; GFX7-NEXT: v_readfirstlane_b32 s10, v2 -; GFX7-NEXT: v_readfirstlane_b32 s11, v3 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 -; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5 -; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4 -; GFX7-NEXT: .LBB15_3: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB15_4 Depth 2 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 -; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: v_mov_b32_e32 v5, v6 -; GFX7-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 -; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX7-NEXT: v_readfirstlane_b32 s8, v0 -; GFX7-NEXT: v_readfirstlane_b32 s9, v1 -; GFX7-NEXT: v_readfirstlane_b32 s10, v2 -; GFX7-NEXT: v_readfirstlane_b32 s11, v3 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB15_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX7-NEXT: s_mov_b64 exec, s[12:13] -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 
-; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB15_3 -; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 -; GFX6-NEXT: v_not_b32_e32 v9, v4 -; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s9, v1 -; GFX6-NEXT: v_readfirstlane_b32 s10, v2 -; GFX6-NEXT: v_readfirstlane_b32 s11, v3 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB15_1 -; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5 -; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4 -; GFX6-NEXT: .LBB15_3: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB15_4 Depth 2 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 -; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX6-NEXT: v_cvt_f16_f32_e32 
v4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: v_mov_b32_e32 v5, v6 -; GFX6-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 -; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s9, v1 -; GFX6-NEXT: v_readfirstlane_b32 s10, v2 -; GFX6-NEXT: v_readfirstlane_b32 s11, v3 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB15_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX6-NEXT: s_mov_b64 exec, s[12:13] -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 -; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB15_3 -; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret half %result -} - -; -------------------------------------------------------------------- -; bfloat -; -------------------------------------------------------------------- - -define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { -; GFX12-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s4, 0x200 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX12-NEXT: s_and_b32 s5, s4, -4 -; GFX12-NEXT: s_and_b32 s4, s4, 3 -; GFX12-NEXT: v_mov_b32_e32 v4, s5 -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: 
global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s4, 0x200 -; GFX940-NEXT: s_and_b32 s5, s4, -4 -; GFX940-NEXT: v_mov_b32_e32 v4, s5 -; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s4, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v2, v2, v0, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s4, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: s_and_b32 s5, s4, -4 -; GFX11-NEXT: s_and_b32 s4, s4, 3 -; GFX11-NEXT: v_mov_b32_e32 v4, s5 -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 
-; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s8, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: s_and_b32 s9, s8, -4 -; GFX10-NEXT: s_and_b32 s8, s8, 3 -; GFX10-NEXT: v_mov_b32_e32 v4, s9 -; GFX10-NEXT: s_lshl_b32 s8, s8, 3 -; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 -; GFX10-NEXT: s_not_b32 s10, s9 -; GFX10-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 
v0, v1, s10, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v2 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s8, 0x200 -; GFX90A-NEXT: s_and_b32 s9, s8, -4 -; GFX90A-NEXT: v_mov_b32_e32 v4, s9 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; GFX90A-NEXT: s_and_b32 s8, s8, 3 -; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 -; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX90A-NEXT: s_not_b32 s11, s8 -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX90A-NEXT: s_movk_i32 s12, 0x7fff -; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s12 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: 
buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v2 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s8, 0x200 -; GFX908-NEXT: s_and_b32 s9, s8, -4 -; GFX908-NEXT: v_mov_b32_e32 v4, s9 -; GFX908-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; GFX908-NEXT: s_and_b32 s8, s8, 3 -; GFX908-NEXT: s_lshl_b32 s10, s8, 3 -; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX908-NEXT: s_not_b32 s11, s8 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX908-NEXT: s_movk_i32 s12, 0x7fff -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v2, v2, v0, s12 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: 
buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v2 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s8, 0x200 -; GFX8-NEXT: s_and_b32 s9, s8, -4 -; GFX8-NEXT: v_mov_b32_e32 v4, s9 -; GFX8-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; GFX8-NEXT: s_and_b32 s8, s8, 3 -; GFX8-NEXT: s_lshl_b32 s10, s8, 3 -; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX8-NEXT: s_not_b32 s11, s8 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v2, s11, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, 
s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s8, 0x200 -; GFX7-NEXT: s_and_b32 s9, s8, -4 -; GFX7-NEXT: v_mov_b32_e32 v4, s9 -; GFX7-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; GFX7-NEXT: s_and_b32 s8, s8, 3 -; GFX7-NEXT: s_lshl_b32 s10, s8, 3 -; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s11, s8 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s11, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB16_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s8, 0x200 -; GFX6-NEXT: s_and_b32 s9, s8, -4 -; GFX6-NEXT: v_mov_b32_e32 v4, s9 -; GFX6-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; GFX6-NEXT: s_and_b32 s8, s8, 3 -; GFX6-NEXT: s_lshl_b32 s10, s8, 3 -; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s11, s8 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s11, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB16_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret bfloat %result -} - -define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) 
inreg %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s4, 0x200 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_and_b32 s5, s4, -4 -; GFX12-NEXT: s_and_b32 s4, s4, 3 -; GFX12-NEXT: v_mov_b32_e32 v2, s5 -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN 
-; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s4, 0x200 -; GFX940-NEXT: s_and_b32 s5, s4, -4 -; GFX940-NEXT: v_mov_b32_e32 v2, s5 -; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s4, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX940-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v4, v4, v0, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: 
v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s4, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: s_and_b32 s5, s4, -4 -; GFX11-NEXT: s_and_b32 s4, s4, 3 -; GFX11-NEXT: v_mov_b32_e32 v2, s5 -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s8, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX10-NEXT: s_and_b32 s9, s8, -4 -; GFX10-NEXT: s_and_b32 s8, s8, 3 -; GFX10-NEXT: v_mov_b32_e32 v2, s9 -; GFX10-NEXT: s_lshl_b32 s8, s8, 3 -; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 -; GFX10-NEXT: s_not_b32 s10, s9 -; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s8, 0x200 -; GFX90A-NEXT: s_and_b32 s9, s8, -4 -; GFX90A-NEXT: v_mov_b32_e32 v2, s9 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX90A-NEXT: s_and_b32 s8, s8, 3 -; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 -; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX90A-NEXT: s_not_b32 s11, s8 -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX90A-NEXT: s_movk_i32 s12, 0x7fff -; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s12 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; 
GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s8, 0x200 -; GFX908-NEXT: s_and_b32 s9, s8, -4 -; GFX908-NEXT: v_mov_b32_e32 v2, s9 -; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX908-NEXT: s_and_b32 s8, s8, 3 -; GFX908-NEXT: s_lshl_b32 s10, s8, 3 -; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX908-NEXT: s_not_b32 s11, s8 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX908-NEXT: s_movk_i32 s12, 0x7fff -; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v4, v4, v0, s12 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, 
exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s8, 0x200 -; GFX8-NEXT: s_and_b32 s9, s8, -4 -; GFX8-NEXT: v_mov_b32_e32 v2, s9 -; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX8-NEXT: s_and_b32 s8, s8, 3 -; GFX8-NEXT: s_lshl_b32 s10, s8, 3 -; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX8-NEXT: s_not_b32 s11, s8 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s11, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: 
s_setpc_b64 s[30:31] -; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s8, 0x200 -; GFX7-NEXT: s_and_b32 s9, s8, -4 -; GFX7-NEXT: v_mov_b32_e32 v2, s9 -; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX7-NEXT: s_and_b32 s8, s8, 3 -; GFX7-NEXT: s_lshl_b32 s10, s8, 3 -; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s11, s8 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s11, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s8, 0x200 -; GFX6-NEXT: s_and_b32 s9, s8, -4 -; GFX6-NEXT: v_mov_b32_e32 v2, s9 -; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX6-NEXT: s_and_b32 s8, s8, 3 -; GFX6-NEXT: s_lshl_b32 s10, s8, 3 -; GFX6-NEXT: s_lshl_b32 s8, 
0xffff, s10 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s11, s8 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s11, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB17_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void -} - -define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; 
GFX12-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX12-NEXT: v_not_b32_e32 v9, v6 -; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX12-NEXT: s_mov_b32 s2, exec_lo -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 
v4, v5, v11, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 -; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB18_3 -; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX940-NEXT: v_and_b32_e32 v9, -4, v4 -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 -; GFX940-NEXT: v_not_b32_e32 v10, v4 -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v5 -; GFX940-NEXT: s_movk_i32 s10, 0x7fff -; GFX940-NEXT: .LBB18_3: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_add_f32_e32 v4, v4, v11 -; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB18_4: ; Parent 
Loop BB18_3 Depth=1 -; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB18_3 -; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX11-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX11-NEXT: v_not_b32_e32 v9, v6 -; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; 
GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 -; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-NEXT: v_mov_b32_e32 
v5, v6 -; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 -; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-NEXT: v_mov_b32_e32 v6, v4 -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB18_3 -; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX10-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX10-NEXT: v_not_b32_e32 
v9, v6 -; GFX10-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_readfirstlane_b32 s8, v0 -; GFX10-NEXT: v_readfirstlane_b32 s9, v1 -; GFX10-NEXT: v_readfirstlane_b32 s10, v2 -; GFX10-NEXT: v_readfirstlane_b32 s11, v3 -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] -; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_1 -; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v5 -; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 -; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX10-NEXT: v_readfirstlane_b32 s8, v0 -; GFX10-NEXT: v_readfirstlane_b32 s9, v1 -; GFX10-NEXT: v_readfirstlane_b32 s10, v2 -; GFX10-NEXT: v_readfirstlane_b32 s11, v3 -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] -; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; 
GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB18_3 -; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4 -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 -; GFX90A-NEXT: v_not_b32_e32 v10, v4 -; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 -; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 -; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz 
.LBB18_1 -; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] -; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v11, 16, v5 -; GFX90A-NEXT: s_movk_i32 s14, 0x7fff -; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v4, v4, v11 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 -; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 -; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 -; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 -; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX90A-NEXT: s_mov_b64 exec, s[12:13] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: 
s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 -; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX908-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 -; GFX908-NEXT: v_not_b32_e32 v9, v4 -; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_readfirstlane_b32 s8, v0 -; GFX908-NEXT: v_readfirstlane_b32 s9, v1 -; GFX908-NEXT: v_readfirstlane_b32 s10, v2 -; GFX908-NEXT: v_readfirstlane_b32 s11, v3 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 -; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] -; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX908-NEXT: s_movk_i32 s14, 0x7fff -; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 -; GFX908-NEXT: v_or_b32_e32 v11, 
0x400000, v4 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 -; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 -; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: v_readfirstlane_b32 s8, v0 -; GFX908-NEXT: v_readfirstlane_b32 s9, v1 -; GFX908-NEXT: v_readfirstlane_b32 s10, v2 -; GFX908-NEXT: v_readfirstlane_b32 s11, v3 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB18_3 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 -; GFX8-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; 
GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 -; GFX8-NEXT: v_not_b32_e32 v9, v4 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_readfirstlane_b32 s8, v0 -; GFX8-NEXT: v_readfirstlane_b32 s9, v1 -; GFX8-NEXT: v_readfirstlane_b32 s10, v2 -; GFX8-NEXT: v_readfirstlane_b32 s11, v3 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 -; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX8-NEXT: .LBB18_3: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 -; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 -; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX8-NEXT: v_readfirstlane_b32 s8, v0 -; GFX8-NEXT: v_readfirstlane_b32 s9, v1 -; GFX8-NEXT: v_readfirstlane_b32 s10, v2 -; GFX8-NEXT: v_readfirstlane_b32 s11, v3 -; GFX8-NEXT: 
v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB18_3 -; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 -; GFX7-NEXT: v_not_b32_e32 v9, v4 -; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_readfirstlane_b32 s8, v0 -; GFX7-NEXT: v_readfirstlane_b32 s9, v1 -; GFX7-NEXT: v_readfirstlane_b32 s10, v2 -; GFX7-NEXT: v_readfirstlane_b32 s11, v3 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB18_1 -; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: 
v_mul_f32_e32 v4, 1.0, v5 -; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 -; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_mov_b32_e32 v5, v6 -; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 -; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX7-NEXT: v_readfirstlane_b32 s8, v0 -; GFX7-NEXT: v_readfirstlane_b32 s9, v1 -; GFX7-NEXT: v_readfirstlane_b32 s10, v2 -; GFX7-NEXT: v_readfirstlane_b32 s11, v3 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB18_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX7-NEXT: s_mov_b64 exec, s[12:13] -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 -; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB18_3 -; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 
-; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 -; GFX6-NEXT: v_not_b32_e32 v9, v4 -; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s9, v1 -; GFX6-NEXT: v_readfirstlane_b32 s10, v2 -; GFX6-NEXT: v_readfirstlane_b32 s11, v3 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB18_1 -; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5 -; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 -; GFX6-NEXT: .LBB18_3: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 -; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_mov_b32_e32 v5, v6 -; GFX6-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 -; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s9, v1 -; GFX6-NEXT: v_readfirstlane_b32 s10, v2 -; GFX6-NEXT: v_readfirstlane_b32 s11, v3 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; 
GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB18_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX6-NEXT: s_mov_b64 exec, s[12:13] -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 -; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB18_3 -; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret bfloat %result -} - -; -------------------------------------------------------------------- -; <2 x half> -; -------------------------------------------------------------------- - -define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv 
scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v3, s4 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s8 -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_cbranch_execnz .LBB19_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s8 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s10, s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s10 -; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 -; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; 
GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX7-NEXT: s_add_i32 s10, s8, 0x400 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB19_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_load_dword v3, v2, 
s[4:7], 0 offen offset:1024 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX6-NEXT: s_add_i32 s10, s8, 0x400 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s10 -; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB19_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret <2 x half> %result -} - -define void 
@buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v3, s4 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; 
GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s8 -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 -; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_cbranch_execnz .LBB20_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s8 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 
s[30:31] -; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s8 -; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 -; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: 
s_add_i32 s10, s8, 0x400 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB20_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: s_add_i32 s10, s8, 0x400 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GFX6-NEXT: s_waitcnt 
vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v2, s10 -; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB20_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void -} - -define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; 
GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, v5 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; 
GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], 0 offen offset:1024 sc0 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 -; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 -; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v7, v8, v5 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_waitcnt_vscnt null, 
0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v6, v7 -; GFX11-NEXT: v_mov_b32_e32 v7, v8 -; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 -; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB21_3 -; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v6 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_readfirstlane_b32 s8, v0 -; GFX10-NEXT: v_readfirstlane_b32 
s9, v1 -; GFX10-NEXT: v_readfirstlane_b32 s10, v2 -; GFX10-NEXT: v_readfirstlane_b32 s11, v3 -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] -; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 -; GFX10-NEXT: ; implicit-def: $vgpr4 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB21_1 -; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v7, v8, v5 -; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v6, v7 -; GFX10-NEXT: v_mov_b32_e32 v7, v8 -; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 -; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX10-NEXT: v_readfirstlane_b32 s8, v0 -; GFX10-NEXT: v_readfirstlane_b32 s9, v1 -; GFX10-NEXT: v_readfirstlane_b32 s10, v2 -; GFX10-NEXT: v_readfirstlane_b32 s11, v3 -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] -; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB21_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, v6 -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: 
s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB21_3 -; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v6 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 -; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 -; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[8:11], 0 offen offset:1024 glc -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 -; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 -; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_readfirstlane_b32 s8, v0 -; GFX908-NEXT: v_readfirstlane_b32 s9, v1 -; GFX908-NEXT: v_readfirstlane_b32 s10, v2 -; GFX908-NEXT: v_readfirstlane_b32 s11, v3 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX908-NEXT: 
v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: ; implicit-def: $vgpr4 -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 -; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] -; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB21_3: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_add_f16 v7, v8, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v7 -; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v7, v8 -; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 -; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: v_readfirstlane_b32 s8, v0 -; GFX908-NEXT: v_readfirstlane_b32 s9, v1 -; GFX908-NEXT: v_readfirstlane_b32 s10, v2 -; GFX908-NEXT: v_readfirstlane_b32 s11, v3 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v8, v6 -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB21_3 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: s_setpc_b64 
s[30:31] -; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_readfirstlane_b32 s8, v0 -; GFX8-NEXT: v_readfirstlane_b32 s9, v1 -; GFX8-NEXT: v_readfirstlane_b32 s10, v2 -; GFX8-NEXT: v_readfirstlane_b32 s11, v3 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 -; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v4, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v6, v8, v5 -; GFX8-NEXT: v_or_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_mov_b32_e32 v6, v7 -; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v7, v8 -; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 -; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX8-NEXT: v_readfirstlane_b32 s8, v0 -; GFX8-NEXT: v_readfirstlane_b32 s9, v1 -; GFX8-NEXT: v_readfirstlane_b32 s10, v2 -; GFX8-NEXT: v_readfirstlane_b32 s11, v3 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: 
buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v8, v6 -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB21_3 -; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 -; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_readfirstlane_b32 s8, v0 -; GFX7-NEXT: v_readfirstlane_b32 s9, v1 -; GFX7-NEXT: v_readfirstlane_b32 s10, v2 -; GFX7-NEXT: v_readfirstlane_b32 s11, v3 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: ; implicit-def: $vgpr4 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB21_1 -; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v5 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8 -; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start -; GFX7-NEXT: ; 
=>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v10 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v11 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 -; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX7-NEXT: v_readfirstlane_b32 s8, v0 -; GFX7-NEXT: v_readfirstlane_b32 s9, v1 -; GFX7-NEXT: v_readfirstlane_b32 s10, v2 -; GFX7-NEXT: v_readfirstlane_b32 s11, v3 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB21_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX7-NEXT: s_mov_b64 exec, s[12:13] -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB21_3 -; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 -; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s9, v1 -; GFX6-NEXT: v_readfirstlane_b32 s10, v2 -; GFX6-NEXT: v_readfirstlane_b32 s11, v3 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: ; implicit-def: $vgpr4 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB21_1 -; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v5 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v8 -; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: .LBB21_3: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v10 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v11 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: 
v_mov_b32_e32 v7, v5 -; GFX6-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 -; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s9, v1 -; GFX6-NEXT: v_readfirstlane_b32 s10, v2 -; GFX6-NEXT: v_readfirstlane_b32 s11, v3 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB21_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX6-NEXT: s_mov_b64 exec, s[12:13] -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB21_3 -; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret <2 x half> %result -} - -define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; 
GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v3, s4 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; 
GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s8 -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_cbranch_execnz .LBB22_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s8 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s10, s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s10 -; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start -; GFX908-NEXT: ; 
=>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 -; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX7-NEXT: s_add_i32 s10, s8, 0x400 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB22_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024 -; GFX6-NEXT: 
v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX6-NEXT: s_add_i32 s10, s8, 0x400 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s10 -; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB22_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst - ret <2 x half> %result -} - -define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) { -; GFX12-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v3, s4 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s8 -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 -; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_cbranch_execnz .LBB23_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s8 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s8 -; GFX908-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s10, s8, 0x400 -; 
GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s10 -; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 -; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB23_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; 
GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: s_add_i32 s10, s8, 0x400 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB23_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: 
v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: s_add_i32 s10, s8, 0x400 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v2, s10 -; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB23_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst - ret void -} - -define <2 x half> 
@buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v3, s4 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: 
buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s8 -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_cbranch_execnz .LBB24_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s8 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024 glc -; GFX90A-NEXT: 
s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s10, s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s10 -; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB24_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 -; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v4, v1, 
v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB24_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX7-NEXT: s_add_i32 s10, s8, 0x400 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; 
GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB24_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX6-NEXT: s_add_i32 s10, s8, 0x400 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s10 -; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: 
v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB24_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 - ret <2 x half> %result -} - -define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v3, s4 -; GFX11-NEXT: s_mov_b32 
s4, 0 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s8 -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 -; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_cbranch_execnz .LBB25_1 -; 
GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s8 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s8 -; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 -; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz 
.LBB25_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: s_add_i32 s10, s8, 0x400 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 +; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX7-NEXT: v_not_b32_e32 v9, v4 +; GFX7-NEXT: s_mov_b64 s[6:7], exec +; GFX7-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_readfirstlane_b32 s8, v0 +; GFX7-NEXT: v_readfirstlane_b32 s9, v1 +; GFX7-NEXT: v_readfirstlane_b32 s10, v2 +; GFX7-NEXT: v_readfirstlane_b32 s11, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: ; %bb.2: +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4 +; GFX7-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Loop Header: Depth=1 +; GFX7-NEXT: ; Child Loop BB8_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-NEXT: .LBB25_1: ; 
%atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 +; GFX7-NEXT: s_mov_b64 s[12:13], exec +; GFX7-NEXT: v_add_f32_e32 v4, v4, v10 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: v_mov_b32_e32 v5, v6 +; GFX7-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX7-NEXT: v_readfirstlane_b32 s8, v0 +; GFX7-NEXT: v_readfirstlane_b32 s9, v1 +; GFX7-NEXT: v_readfirstlane_b32 s10, v2 +; GFX7-NEXT: v_readfirstlane_b32 s11, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB8_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB25_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB8_3 +; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: s_add_i32 s10, s8, 0x400 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v2, s10 -; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 +; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX6-NEXT: v_not_b32_e32 v9, v4 +; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s9, v1 +; GFX6-NEXT: v_readfirstlane_b32 s10, v2 +; GFX6-NEXT: v_readfirstlane_b32 s11, v3 +; GFX6-NEXT: 
v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: ; %bb.2: +; GFX6-NEXT: s_mov_b64 exec, s[6:7] +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GFX6-NEXT: s_mov_b64 s[6:7], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4 +; GFX6-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Loop Header: Depth=1 +; GFX6-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 +; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_add_f32_e32 v4, v4, v10 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: v_mov_b32_e32 v5, v6 +; GFX6-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s9, v1 +; GFX6-NEXT: v_readfirstlane_b32 s10, v2 +; GFX6-NEXT: v_readfirstlane_b32 s11, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: 
v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB8_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB25_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX6-NEXT: s_cbranch_execnz .LBB8_3 +; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 - ret void + %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst + ret half %result } ; -------------------------------------------------------------------- -; <2 x bfloat> +; bfloat ; -------------------------------------------------------------------- -define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x 
bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s4, 0x400 -; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: s_addk_i32 s6, 0x200 +; GFX940-NEXT: s_and_b32 s4, s6, -4 ; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 +; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: .LBB9_1: ; 
%atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX940-NEXT: v_add3_u32 v2, v2, v0, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 -; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB26_1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; 
GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX940-NEXT: s_cbranch_execnz .LBB9_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x400 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: s_and_b32 s4, s6, -4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: 
v_add_f32_e32 v0, v0, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz 
.LBB26_1 +; GFX11-NEXT: s_cbranch_execnz .LBB9_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 -; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s8 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX10-NEXT: s_addk_i32 s18, 0x200 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_and_b32 s4, s18, -4 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 
0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s8 -; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_cbranch_execnz .LBB26_1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB9_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_addk_i32 s8, 0x400 -; GFX90A-NEXT: s_mov_b64 s[10:11], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: s_addk_i32 s18, 0x200 +; GFX90A-NEXT: s_and_b32 s4, s18, -4 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s18, 3 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s8 -; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s12 -; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s12 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[8:9] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s13 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 
offen glc +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s12 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 -; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_addk_i32 s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[10:11], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX908-NEXT: s_addk_i32 s18, 0x200 +; GFX908-NEXT: s_and_b32 s4, s18, -4 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: 
s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX908-NEXT: s_and_b32 s4, s18, 3 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX908-NEXT: s_mov_b32 s13, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s8 -; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX908-NEXT: v_add3_u32 v5, v5, v0, s12 -; GFX908-NEXT: v_add3_u32 v8, v8, v1, s12 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9] -; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v5, v1, v0, s13 -; GFX908-NEXT: v_mov_b32_e32 v0, v5 -; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v2, v2, v0, s12 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX908-NEXT: 
v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX908-NEXT: s_cbranch_execnz .LBB26_1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v1, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_addk_i32 s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[10:11], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX8-NEXT: s_addk_i32 s18, 0x200 +; GFX8-NEXT: s_and_b32 s4, s18, -4 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX8-NEXT: s_and_b32 s4, s18, 3 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: 
s_not_b32 s7, s4 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9] -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, v5 -; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: 
buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX8-NEXT: s_cbranch_execnz .LBB26_1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024 -; GFX7-NEXT: s_addk_i32 s8, 0x400 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: s_mov_b64 s[10:11], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: s_addk_i32 s18, 0x200 +; GFX7-NEXT: s_and_b32 s4, s18, -4 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX7-NEXT: s_and_b32 s4, s18, 3 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX7-NEXT: s_not_b32 s7, s4 +; 
GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX7-NEXT: s_cbranch_execnz .LBB26_1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB9_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024 -; GFX6-NEXT: s_addk_i32 s8, 0x400 +; GFX6-NEXT: s_addk_i32 s18, 0x200 +; GFX6-NEXT: s_and_b32 s4, s18, -4 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX6-NEXT: s_and_b32 s4, s18, 3 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: s_mov_b64 s[10:11], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s8 -; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; 
GFX6-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX6-NEXT: s_cbranch_execnz .LBB26_1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB9_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret <2 x bfloat> %result + %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst + ret bfloat %result } -define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; 
GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s4, 0x400 -; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX940-NEXT: s_addk_i32 s6, 0x200 +; GFX940-NEXT: s_and_b32 s4, s6, -4 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 +; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, 
v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX940-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX940-NEXT: v_add3_u32 v4, v4, v0, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB27_1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX940-NEXT: s_cbranch_execnz .LBB10_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset: ; GFX11: ; %bb.0: ; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: s_and_b32 s4, s6, -4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 ; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s8 -; GFX10-NEXT: s_addk_i32 
s8, 0x400 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, s8 -; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX10-NEXT: s_addk_i32 s18, 0x200 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_and_b32 s4, s18, -4 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s8 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v5, 
0x400000, v0 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s8 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_addk_i32 s8, 0x400 -; GFX90A-NEXT: s_mov_b64 s[10:11], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_addk_i32 s18, 0x200 +; GFX90A-NEXT: s_and_b32 s4, s18, -4 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s18, 3 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; 
GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s8 -; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13 -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen glc +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s12 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s8 -; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_addk_i32 s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[10:11], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: s_addk_i32 s18, 0x200 +; GFX908-NEXT: s_and_b32 s4, s18, -4 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v2, s4 +; GFX908-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX908-NEXT: s_and_b32 s4, s18, 3 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX908-NEXT: s_mov_b32 s13, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s8 -; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) 
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s12 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s12 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s13 -; GFX908-NEXT: v_mov_b32_e32 v6, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v4, v4, v0, s12 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX908-NEXT: s_cbranch_execnz .LBB27_1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; 
GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_addk_i32 s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[10:11], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX8-NEXT: s_addk_i32 s18, 0x200 +; GFX8-NEXT: s_and_b32 s4, s18, -4 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX8-NEXT: s_and_b32 s4, s18, 3 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; 
GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX8-NEXT: s_cbranch_execnz .LBB27_1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 -; GFX7-NEXT: s_addk_i32 s8, 0x400 +; GFX7-NEXT: 
s_addk_i32 s18, 0x200 +; GFX7-NEXT: s_and_b32 s4, s18, -4 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX7-NEXT: s_and_b32 s4, s18, 3 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: s_mov_b64 s[10:11], 0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: buffer_atomic_cmpswap 
v[4:5], v2, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX7-NEXT: s_cbranch_execnz .LBB27_1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 -; GFX6-NEXT: s_addk_i32 s8, 0x400 +; GFX6-NEXT: s_addk_i32 s18, 0x200 +; GFX6-NEXT: s_and_b32 s4, s18, -4 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX6-NEXT: s_and_b32 s4, s18, 3 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: s_mov_b64 s[10:11], 0 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: 
v_and_b32_e32 v3, 0xffff0000, v0 +; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc +; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX6-NEXT: s_cbranch_execnz .LBB27_1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw 
fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 + %unused = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst ret void } -define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr addrspace(7) %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX12-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX12-NEXT: v_not_b32_e32 v9, v6 +; GFX12-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -8808,25 +4137,80 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: 
buffer_atomic_pk_add_bf16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Loop Header: Depth=1 +; GFX12-NEXT: ; Child Loop BB11_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, v5 +; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX12-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-NEXT: v_readfirstlane_b32 s7, v3 
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB11_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX12-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB11_3 +; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 +; GFX940-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 +; GFX940-NEXT: v_not_b32_e32 v10, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: 
v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -8836,41 +4220,32 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 -; GFX940-NEXT: ; implicit-def: $vgpr4 +; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_1 +; GFX940-NEXT: s_cbranch_execnz .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v5 ; GFX940-NEXT: s_movk_i32 s10, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX940-NEXT: s_mov_b32 s11, 0x7060302 -; GFX940-NEXT: .LBB28_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB11_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB11_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX940-NEXT: v_add_f32_e32 v4, v4, v9 +; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_mov_b64 s[8:9], exec +; GFX940-NEXT: v_add_f32_e32 v4, v4, v11 ; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10 ; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v10 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s10 -; GFX940-NEXT: v_or_b32_e32 v11, 
0x400000, v5 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc -; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11 +; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 +; GFX940-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -8882,10 +4257,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB11_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -8893,19 +4268,26 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB28_3 +; GFX940-NEXT: s_cbranch_execnz .LBB11_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX11-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX11-NEXT: v_not_b32_e32 v9, v6 +; GFX11-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -8916,43 +4298,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 -; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-NEXT: s_cbranch_execnz .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB28_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB11_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB11_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; 
GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8 -; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 +; GFX11-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; 
GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -8965,10 +4343,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB11_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -8978,20 +4356,25 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB28_3 +; GFX11-NEXT: s_cbranch_execnz .LBB11_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX10-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX10-NEXT: 
v_not_b32_e32 v9, v6 +; GFX10-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -9000,39 +4383,31 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 -; GFX10-NEXT: ; implicit-def: $vgpr4 +; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB28_1 +; GFX10-NEXT: s_cbranch_execnz .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX10-NEXT: .LBB28_3: ; %atomicrmw.start +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX10-NEXT: .LBB11_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB11_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX10-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 
v11, v11, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo -; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 +; GFX10-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -9043,11 +4418,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB28_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB11_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -9057,18 +4432,24 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB28_3 +; GFX10-NEXT: s_cbranch_execnz .LBB11_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: 
v_lshrrev_b32_e32 v0, v7, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 +; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 +; GFX90A-NEXT: v_not_b32_e32 v10, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -9078,39 +4459,30 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: ; implicit-def: $vgpr4 +; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX90A-NEXT: v_lshlrev_b32_e32 v11, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 -; GFX90A-NEXT: .LBB28_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB11_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX90A-NEXT: ; Child Loop 
BB11_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX90A-NEXT: v_add_f32_e32 v4, v4, v9 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v4, v4, v11 ; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 ; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v10 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14 -; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 +; GFX90A-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -9121,10 +4493,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB28_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB11_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 ; 
GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -9132,18 +4504,24 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB28_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB11_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 +; GFX908-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 +; GFX908-NEXT: v_not_b32_e32 v9, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -9153,40 +4531,31 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: ; implicit-def: $vgpr4 +; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB28_1 +; GFX908-NEXT: 
s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX908-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX908-NEXT: s_mov_b32 s15, 0x7060302 -; GFX908-NEXT: .LBB28_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB11_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB11_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX908-NEXT: v_add_f32_e32 v4, v4, v8 +; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v4, v4, v10 ; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1 -; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14 -; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15 +; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 +; GFX908-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; 
GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -9197,10 +4566,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB28_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB11_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -9208,18 +4577,24 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB28_3 +; GFX908-NEXT: s_cbranch_execnz .LBB11_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 +; GFX8-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 +; GFX8-NEXT: v_not_b32_e32 v9, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB11_1: ; =>This Inner 
Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -9229,41 +4604,32 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: ; implicit-def: $vgpr4 +; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB28_1 +; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX8-NEXT: .LBB28_3: ; %atomicrmw.start +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX8-NEXT: .LBB11_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB11_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX8-NEXT: v_add_f32_e32 v4, v4, v8 +; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v4, v4, v10 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, 
v10, v11, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 +; GFX8-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -9274,10 +4640,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB28_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB11_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -9285,18 +4651,23 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB28_3 +; GFX8-NEXT: s_cbranch_execnz .LBB11_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; 
GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 +; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -9305,37 +4676,29 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: ; implicit-def: $vgpr4 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB28_1 -; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7-NEXT: ; %bb.2: +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX7-NEXT: .LBB28_3: ; %atomicrmw.start +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 +; GFX7-NEXT: .LBB11_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB28_4 Depth 
2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v9 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v5, v6 +; GFX7-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -9346,31 +4709,35 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB28_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB11_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_lshlrev_b32_e32 
v7, 16, v6 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB28_3 +; GFX7-NEXT: s_cbranch_execnz .LBB11_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 +; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -9379,857 +4746,1118 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: ; implicit-def: $vgpr4 +; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB28_1 +; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX6-NEXT: 
v_mul_f32_e32 v5, 1.0, v6 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; GFX6-NEXT: s_mov_b64 s[6:7], 0 +; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 +; GFX6-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Loop Header: Depth=1 +; GFX6-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_mov_b32_e32 v5, v6 +; GFX6-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s9, v1 +; GFX6-NEXT: v_readfirstlane_b32 s10, v2 +; GFX6-NEXT: v_readfirstlane_b32 s11, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB11_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX6-NEXT: s_mov_b64 exec, s[12:13] +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX6-NEXT: s_cbranch_execnz .LBB11_3 +; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = 
getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst + ret bfloat %result +} + +; -------------------------------------------------------------------- +; <2 x half> +; -------------------------------------------------------------------- + +define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; 
GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_add_i32 s4, s18, 0x400 +; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, 
exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v1, s18 +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, v0 
+; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: .LBB12_1: ; 
%atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX6-NEXT: .LBB28_3: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Loop Header: 
Depth=1 -; GFX6-NEXT: ; Child Loop BB28_4 Depth 2 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; GFX6-NEXT: v_mul_f32_e32 v7, 1.0, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v9 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 -; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s9, v1 -; GFX6-NEXT: v_readfirstlane_b32 s10, v2 -; GFX6-NEXT: v_readfirstlane_b32 s11, v3 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc -; 
GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB28_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 -; GFX6-NEXT: s_mov_b64 exec, s[12:13] +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 -; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB28_3 -; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v0, v7 -; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB12_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret <2 x bfloat> %result + %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst + ret <2 x half> %result } -define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: +define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: ; GFX12: ; 
%bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s4, 0x400 -; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, s6 ; GFX940-NEXT: 
buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 -; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB29_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x400 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: v_pk_add_f16 v1, v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: 
s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 -; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s8 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v1, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_add_i32 s4, s18, 0x400 +; GFX10-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s8 -; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, 
v5 -; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_cbranch_execnz .LBB29_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_addk_i32 s8, 0x400 -; GFX90A-NEXT: s_mov_b64 s[10:11], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX90A-NEXT: s_movk_i32 s12, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s8 -; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s12 -; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s12 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[8:9] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s13 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: 
buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 -; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_addk_i32 s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[10:11], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX908-NEXT: s_movk_i32 s12, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX908-NEXT: s_mov_b32 s13, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s8 -; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX908-NEXT: v_add3_u32 v5, v5, v0, s12 -; GFX908-NEXT: v_add3_u32 v8, v8, v1, s12 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9] -; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v5, v1, v0, s13 -; GFX908-NEXT: v_mov_b32_e32 v0, v5 -; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; 
GFX10-NEXT: v_pk_add_f16 v1, v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v1, s18 +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, s18 +; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX908-NEXT: s_cbranch_execnz .LBB29_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: +; 
GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_addk_i32 s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[10:11], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s18 +; GFX8-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9] -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, v5 -; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX8-NEXT: s_cbranch_execnz .LBB29_1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024 -; GFX7-NEXT: s_addk_i32 s8, 0x400 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: s_mov_b64 s[10:11], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: 
v_mov_b32_e32 v4, s8 -; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX7-NEXT: s_cbranch_execnz .LBB29_1 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: 
v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024 -; GFX6-NEXT: s_addk_i32 s8, 0x400 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: s_mov_b64 s[10:11], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s8 -; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: 
s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX6-NEXT: s_cbranch_execnz .LBB29_1 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst - ret <2 x bfloat> 
%result + %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 + %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst + ret void } -define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall(ptr addrspace(7) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: 
v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s4, 0x400 -; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_readfirstlane_b32 s4, v0 +; GFX940-NEXT: v_readfirstlane_b32 s5, v1 +; GFX940-NEXT: v_readfirstlane_b32 s6, v2 +; GFX940-NEXT: v_readfirstlane_b32 s7, v3 +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], 0 offen offset:1024 sc0 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX940-NEXT: ; implicit-def: $vgpr4 +; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, v5 ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB30_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_addk_i32 s4, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: ; %bb.2: +; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-NEXT: ; Child Loop BB14_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: v_pk_add_f16 v7, v8, v5 +; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-NEXT: v_mov_b32_e32 v7, v8 +; GFX11-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-NEXT: v_readfirstlane_b32 s7, v3 ; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB14_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX11-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v5 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB30_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB14_3 +; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX10-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, s8 -; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_mov_b32 s6, exec_lo +; GFX10-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_readfirstlane_b32 s8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s9, v1 +; GFX10-NEXT: v_readfirstlane_b32 s10, v2 +; GFX10-NEXT: v_readfirstlane_b32 s11, v3 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: ; implicit-def: $vgpr4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: ; %bb.2: +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Loop Header: Depth=1 +; GFX10-NEXT: ; Child Loop BB14_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_pk_add_f16 v7, v8, v5 +; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s8 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v6, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v8 +; GFX10-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX10-NEXT: v_readfirstlane_b32 s8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s9, v1 +; GFX10-NEXT: v_readfirstlane_b32 s10, v2 +; GFX10-NEXT: v_readfirstlane_b32 s11, v3 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB14_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_cbranch_execnz .LBB30_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB14_3 +; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end +; 
GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s8 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_addk_i32 s8, 0x400 -; GFX90A-NEXT: s_mov_b64 s[10:11], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: s_movk_i32 s12, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s8 -; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_mov_b64 s[6:7], exec +; GFX90A-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 +; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 +; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: 
v_perm_b32 v0, v5, v0, s13 -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[8:11], 0 offen offset:1024 glc +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4 +; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s8 -; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_addk_i32 s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[10:11], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX908-NEXT: s_movk_i32 s12, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX908-NEXT: s_mov_b32 s13, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s8 -; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 +; GFX908-NEXT: s_mov_b64 s[6:7], exec +; GFX908-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_readfirstlane_b32 s8, v0 +; GFX908-NEXT: v_readfirstlane_b32 s9, v1 +; GFX908-NEXT: v_readfirstlane_b32 s10, v2 +; GFX908-NEXT: v_readfirstlane_b32 s11, v3 +; GFX908-NEXT: 
v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: ; implicit-def: $vgpr4 +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: ; %bb.2: +; GFX908-NEXT: s_mov_b64 exec, s[6:7] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Loop Header: Depth=1 +; GFX908-NEXT: ; Child Loop BB14_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s12 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s12 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s13 -; GFX908-NEXT: v_mov_b32_e32 v6, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX908-NEXT: v_pk_add_f16 v7, v8, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v7 +; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v7, v8 +; GFX908-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX908-NEXT: v_readfirstlane_b32 s8, v0 +; GFX908-NEXT: v_readfirstlane_b32 s9, v1 +; GFX908-NEXT: v_readfirstlane_b32 s10, v2 +; GFX908-NEXT: v_readfirstlane_b32 s11, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; 
GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB14_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX908-NEXT: s_mov_b64 exec, s[12:13] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX908-NEXT: s_cbranch_execnz .LBB30_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB14_3 +; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v6 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_addk_i32 s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[10:11], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_readfirstlane_b32 s8, v0 +; 
GFX8-NEXT: v_readfirstlane_b32 s9, v1 +; GFX8-NEXT: v_readfirstlane_b32 s10, v2 +; GFX8-NEXT: v_readfirstlane_b32 s11, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: ; implicit-def: $vgpr4 +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: ; %bb.2: +; GFX8-NEXT: s_mov_b64 exec, s[6:7] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Loop Header: Depth=1 +; GFX8-NEXT: ; Child Loop BB14_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX8-NEXT: v_add_f16_sdwa v4, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v6, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v6, v7 +; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: 
v_mov_b32_e32 v7, v8 +; GFX8-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX8-NEXT: v_readfirstlane_b32 s8, v0 +; GFX8-NEXT: v_readfirstlane_b32 s9, v1 +; GFX8-NEXT: v_readfirstlane_b32 s10, v2 +; GFX8-NEXT: v_readfirstlane_b32 s11, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB14_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v8, v6 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX8-NEXT: s_cbranch_execnz .LBB30_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB14_3 +; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v0, v6 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 -; GFX7-NEXT: s_addk_i32 s8, 0x400 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: s_mov_b64 s[10:11], 0 -; GFX7-NEXT: v_and_b32_e32 v0, 
0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 +; GFX7-NEXT: s_mov_b64 s[6:7], exec +; GFX7-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_readfirstlane_b32 s8, v0 +; GFX7-NEXT: v_readfirstlane_b32 s9, v1 +; GFX7-NEXT: v_readfirstlane_b32 s10, v2 +; GFX7-NEXT: v_readfirstlane_b32 s11, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: ; implicit-def: $vgpr4 +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: ; %bb.2: +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8 +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB14_3: ; %atomicrmw.start +; 
GFX7-NEXT: ; =>This Loop Header: Depth=1 +; GFX7-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[12:13], exec +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v10 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v11 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX7-NEXT: v_readfirstlane_b32 s8, v0 +; GFX7-NEXT: v_readfirstlane_b32 s9, v1 +; GFX7-NEXT: v_readfirstlane_b32 s10, v2 +; GFX7-NEXT: v_readfirstlane_b32 s11, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB14_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX7-NEXT: s_cbranch_execnz .LBB30_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX7-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB14_3 +; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 -; GFX6-NEXT: s_addk_i32 s8, 0x400 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: s_mov_b64 s[10:11], 0 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 +; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s9, v1 +; GFX6-NEXT: v_readfirstlane_b32 s10, v2 +; GFX6-NEXT: v_readfirstlane_b32 s11, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: ; implicit-def: $vgpr4 +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB14_1 +; GFX6-NEXT: ; %bb.2: +; GFX6-NEXT: s_mov_b64 exec, s[6:7] +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v5 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v3, 
1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v8 +; GFX6-NEXT: s_mov_b64 s[6:7], 0 +; GFX6-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Loop Header: Depth=1 +; GFX6-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v10 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v11 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s9, v1 +; GFX6-NEXT: v_readfirstlane_b32 s10, v2 +; GFX6-NEXT: v_readfirstlane_b32 s11, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB14_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX6-NEXT: s_cbranch_execnz .LBB30_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX6-NEXT: s_cbranch_execnz .LBB14_3 +; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst - ret void + %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst + ret <2 x half> %result } -define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: +; -------------------------------------------------------------------- +; <2 x bfloat> +; -------------------------------------------------------------------- + +define <2 x bfloat> 
@buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: v_mov_b32_e32 v1, s6 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: v_mov_b32_e32 v0, s6 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s4, 0x400 +; GFX940-NEXT: s_add_i32 s4, s6, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s9, 0x7060302 ; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v0 @@ -10256,16 +5884,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB31_1 +; GFX940-NEXT: s_cbranch_execnz .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_add_i32 s4, s6, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -10274,7 +5902,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v0 @@ -10308,24 +5936,28 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 -; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s8 +; GFX10-NEXT: v_mov_b32_e32 v0, s18 +; 
GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_add_i32 s4, s18, 0x400 +; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -10341,38 +5973,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_cbranch_execnz .LBB31_1 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB15_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_addk_i32 s8, 0x400 -; GFX90A-NEXT: s_mov_b64 s[10:11], 0 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v0, s18 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s8 -; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v0 @@ -10387,36 +6023,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s12 ; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[8:9] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s13 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 -; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_addk_i32 s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[10:11], 0 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s18, 0x400 +; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s13, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s8 -; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v0 @@ -10431,35 +6071,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX908-NEXT: v_add3_u32 v5, v5, v0, s12 ; GFX908-NEXT: v_add3_u32 v8, v8, v1, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX908-NEXT: 
v_cmp_u_f32_e64 s[8:9], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX908-NEXT: v_perm_b32 v5, v1, v0, s13 ; GFX908-NEXT: v_mov_b32_e32 v0, v5 ; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX908-NEXT: s_cbranch_execnz .LBB31_1 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_addk_i32 s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[10:11], 0 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s18, 0x400 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: 
.LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -10476,40 +6120,44 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX8-NEXT: s_cbranch_execnz .LBB31_1 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024 -; GFX7-NEXT: s_addk_i32 s8, 0x400 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: buffer_load_dword v4, 
v2, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s18, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: s_mov_b64 s[10:11], 0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -10523,35 +6171,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v1 ; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX7-NEXT: s_cbranch_execnz .LBB31_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_load_dword v4, v2, s[4:7], 0 
offen offset:1024 -; GFX6-NEXT: s_addk_i32 s8, 0x400 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s18, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: s_mov_b64 s[10:11], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s8 -; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -10566,52 +6218,52 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v1 ; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX6-NEXT: s_cbranch_execnz .LBB31_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB15_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x 
bfloat>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst ret <2 x bfloat> %result } -define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: +define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: v_mov_b32_e32 v1, s6 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: v_mov_b32_e32 v1, s6 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s4, 0x400 +; GFX940-NEXT: s_add_i32 s4, s6, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX940-NEXT: s_mov_b32 s9, 0x7060302 ; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; 
GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -10638,23 +6290,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v1, v6 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB32_1 +; GFX940-NEXT: s_cbranch_execnz .LBB16_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_add_i32 s4, s6, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 @@ -10686,23 +6338,27 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-NEXT: s_cbranch_execnz .LBB16_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v1, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_add_i32 s4, s18, 0x400 +; GFX10-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, s8 -; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -10717,38 +6373,42 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 ; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX10-NEXT: 
v_mov_b32_e32 v1, v5 -; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_cbranch_execnz .LBB32_1 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s8 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_addk_i32 s8, 0x400 -; GFX90A-NEXT: s_mov_b64 s[10:11], 0 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v1, s18 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s8 -; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -10762,36 +6422,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] +; 
GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s8 -; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_addk_i32 s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[10:11], 0 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, s18 +; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s18, 0x400 +; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX908-NEXT: s_mov_b32 s13, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s8 -; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX908-NEXT: 
v_mov_b32_e32 v4, s4 +; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -10805,35 +6469,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX908-NEXT: v_add3_u32 v6, v6, v0, s12 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX908-NEXT: v_perm_b32 v0, v5, v0, s13 ; GFX908-NEXT: v_mov_b32_e32 v6, v1 ; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX908-NEXT: s_cbranch_execnz .LBB32_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_addk_i32 s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[10:11], 0 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; 
GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s18 +; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s18, 0x400 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -10849,41 +6517,45 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX8-NEXT: s_cbranch_execnz .LBB32_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 -; GFX7-NEXT: s_addk_i32 s8, 0x400 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s18, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: s_mov_b64 s[10:11], 0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -10897,35 +6569,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX7-NEXT: s_cbranch_execnz .LBB32_1 +; 
GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 -; GFX6-NEXT: s_addk_i32 s8, 0x400 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s18, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: s_mov_b64 s[10:11], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -10940,418 +6616,700 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; 
GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX6-NEXT: s_cbranch_execnz .LBB32_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst ret void } -define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s4, 0x400 -; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_readfirstlane_b32 s4, v0 +; GFX940-NEXT: v_readfirstlane_b32 s5, v1 +; GFX940-NEXT: v_readfirstlane_b32 s6, v2 +; GFX940-NEXT: v_readfirstlane_b32 s7, v3 +; GFX940-NEXT: 
v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: ; implicit-def: $vgpr4 +; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: s_mov_b64 exec, s[2:3] +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX940-NEXT: s_movk_i32 s10, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; GFX940-NEXT: s_mov_b32 s11, 0x7060302 +; GFX940-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Loop Header: Depth=1 +; GFX940-NEXT: ; Child Loop BB17_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX940-NEXT: v_add_f32_e32 v4, v4, v9 +; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_cndmask_b32_e32 
v4, v5, v6, vcc +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v10 +; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX940-NEXT: v_add3_u32 v6, v6, v5, s10 +; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc +; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX940-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX940-NEXT: v_readfirstlane_b32 s4, v0 +; GFX940-NEXT: v_readfirstlane_b32 s5, v1 +; GFX940-NEXT: v_readfirstlane_b32 s6, v2 +; GFX940-NEXT: v_readfirstlane_b32 s7, v3 +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB17_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB33_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB17_3 +; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_addk_i32 s4, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: ; %bb.2: +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-NEXT: ; Child Loop BB17_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8 +; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX11-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 
+; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v5 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB17_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB17_3 +; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall: ; 
GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, s8 -; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_mov_b32 s6, exec_lo +; GFX10-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_readfirstlane_b32 s8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s9, v1 +; GFX10-NEXT: v_readfirstlane_b32 s10, v2 +; GFX10-NEXT: v_readfirstlane_b32 s11, v3 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: ; implicit-def: $vgpr4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: ; %bb.2: +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX10-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Loop Header: Depth=1 +; GFX10-NEXT: ; Child Loop BB17_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 
v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_add_f32_e32 v4, v4, v8 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s8 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, v6 +; GFX10-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX10-NEXT: v_readfirstlane_b32 s8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s9, v1 +; GFX10-NEXT: v_readfirstlane_b32 s10, v2 +; GFX10-NEXT: v_readfirstlane_b32 s11, v3 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB17_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_cbranch_execnz .LBB33_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB17_3 +; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s8 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_addk_i32 s8, 0x400 -; GFX90A-NEXT: s_mov_b64 s[10:11], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: s_movk_i32 s12, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s8 -; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX90A-NEXT: s_mov_b64 s[6:7], exec +; GFX90A-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 +; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 +; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: ; implicit-def: $vgpr4 +; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: s_mov_b64 exec, s[6:7] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX90A-NEXT: s_movk_i32 s14, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 +; GFX90A-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Loop Header: Depth=1 +; GFX90A-NEXT: ; Child Loop BB17_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX90A-NEXT: v_add_f32_e32 v4, v4, v9 +; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v10 +; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13 -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: 
buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen glc +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 +; GFX90A-NEXT: s_mov_b64 s[12:13], exec +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 +; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 +; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB17_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB17_3 +; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall: ; 
GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s8 -; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_addk_i32 s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[10:11], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX908-NEXT: s_movk_i32 s12, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX908-NEXT: s_mov_b32 s13, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s8 -; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 +; GFX908-NEXT: s_mov_b64 s[6:7], exec +; GFX908-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_readfirstlane_b32 s8, v0 +; GFX908-NEXT: v_readfirstlane_b32 s9, v1 +; GFX908-NEXT: v_readfirstlane_b32 s10, v2 +; GFX908-NEXT: v_readfirstlane_b32 s11, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: ; implicit-def: $vgpr4 +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: ; %bb.2: +; GFX908-NEXT: s_mov_b64 exec, s[6:7] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX908-NEXT: s_movk_i32 s14, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX908-NEXT: s_mov_b32 s15, 0x7060302 +; GFX908-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Loop Header: Depth=1 +; GFX908-NEXT: ; Child Loop BB17_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, 
v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s12 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s12 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX908-NEXT: v_add_f32_e32 v4, v4, v8 +; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14 +; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s13 -; GFX908-NEXT: v_mov_b32_e32 v6, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15 +; GFX908-NEXT: v_mov_b32_e32 v4, v5 +; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX908-NEXT: v_readfirstlane_b32 s8, v0 +; GFX908-NEXT: v_readfirstlane_b32 s9, v1 +; GFX908-NEXT: v_readfirstlane_b32 s10, v2 +; GFX908-NEXT: v_readfirstlane_b32 s11, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB17_4 +; 
GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX908-NEXT: s_cbranch_execnz .LBB33_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB17_3 +; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_addk_i32 s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[10:11], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_readfirstlane_b32 s8, v0 +; GFX8-NEXT: v_readfirstlane_b32 s9, v1 +; GFX8-NEXT: v_readfirstlane_b32 s10, v2 +; GFX8-NEXT: v_readfirstlane_b32 s11, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: 
s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: ; implicit-def: $vgpr4 +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: ; %bb.2: +; GFX8-NEXT: s_mov_b64 exec, s[6:7] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX8-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Loop Header: Depth=1 +; GFX8-NEXT: ; Child Loop BB17_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX8-NEXT: v_add_f32_e32 v4, v4, v8 +; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; 
GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, v5 +; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v5, v6 +; GFX8-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX8-NEXT: v_readfirstlane_b32 s8, v0 +; GFX8-NEXT: v_readfirstlane_b32 s9, v1 +; GFX8-NEXT: v_readfirstlane_b32 s10, v2 +; GFX8-NEXT: v_readfirstlane_b32 s11, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB17_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX8-NEXT: s_mov_b64 exec, s[12:13] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX8-NEXT: s_cbranch_execnz .LBB33_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB17_3 +; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX7-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 -; GFX7-NEXT: s_addk_i32 s8, 0x400 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: s_mov_b64 s[10:11], 0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 +; GFX7-NEXT: s_mov_b64 s[6:7], exec +; GFX7-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_readfirstlane_b32 s8, v0 +; GFX7-NEXT: v_readfirstlane_b32 s9, v1 +; GFX7-NEXT: v_readfirstlane_b32 s10, v2 +; GFX7-NEXT: v_readfirstlane_b32 s11, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: ; implicit-def: $vgpr4 +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: ; %bb.2: +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; GFX7-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Loop Header: Depth=1 +; GFX7-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_mov_b64 s[12:13], exec ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc +; GFX7-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX7-NEXT: v_readfirstlane_b32 s8, v0 +; GFX7-NEXT: v_readfirstlane_b32 s9, v1 +; GFX7-NEXT: v_readfirstlane_b32 s10, v2 +; GFX7-NEXT: v_readfirstlane_b32 s11, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB17_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 
vcc, v5, v4 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX7-NEXT: s_cbranch_execnz .LBB33_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB17_3 +; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v0, v7 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 -; GFX6-NEXT: s_addk_i32 s8, 0x400 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: s_mov_b64 s[10:11], 0 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 +; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s9, v1 +; GFX6-NEXT: v_readfirstlane_b32 s10, v2 +; GFX6-NEXT: v_readfirstlane_b32 s11, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: ; implicit-def: $vgpr4 +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB17_1 +; GFX6-NEXT: ; %bb.2: +; GFX6-NEXT: s_mov_b64 exec, s[6:7] +; GFX6-NEXT: 
v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX6-NEXT: s_mov_b64 s[6:7], 0 +; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; GFX6-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Loop Header: Depth=1 +; GFX6-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; GFX6-NEXT: v_mul_f32_e32 v7, 1.0, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v9 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc +; GFX6-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s9, v1 +; GFX6-NEXT: v_readfirstlane_b32 s10, v2 +; 
GFX6-NEXT: v_readfirstlane_b32 s11, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB17_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX6-NEXT: s_cbranch_execnz .LBB33_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX6-NEXT: s_cbranch_execnz .LBB17_3 +; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v0, v7 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 - ret void + %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + ret <2 x bfloat> %result } ; -------------------------------------------------------------------- ; misc ; -------------------------------------------------------------------- -define float 
@buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: +define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: s_addk_co_i32 s4, 0x400 +; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v3, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 -; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 @@ -11366,32 +7324,31 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: v_mov_b32_e32 v1, s6 ; 
GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: s_add_i32 s4, s6, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 @@ -11407,21 +7364,25 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s8 -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen 
offset:1024 -; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v0, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_add_i32 s4, s18, 0x400 +; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 @@ -11429,137 +7390,157 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_cbranch_execnz .LBB34_1 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s10 -; GFX90A-NEXT: 
.LBB34_1: ; %atomicrmw.start +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v0, s18 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s10, s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s10 -; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: 
s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 ; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB34_1 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 -; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: buffer_load_dword v0, v0, 
s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v0 ; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB34_1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s10, s8, 0x400 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s10 -; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s6 +; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v0 ; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB34_1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s10, s8, 0x400 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s10 -; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 +; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v0 @@ -11567,22 +7548,22 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt 
expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, v4 ; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB34_1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fadd ptr addrspace(7) %gep, float %val seq_cst ret float %result } attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" } -!0 = !{} + diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index 06dee9c279f2c..503065cc07647 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -13,28 +13,28 @@ ; float ; -------------------------------------------------------------------- -define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: +define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: 
s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: v_mov_b32_e32 v1, s6 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: v_mov_b32_e32 v0, s6 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s4, 0x400 +; GFX940-NEXT: s_addk_i32 s6, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 @@ -57,10 +57,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -68,27 +68,35 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v0, s18 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -96,27 +104,31 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX90A-NEXT: 
buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s10, s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s10 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -125,27 +137,31 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; 
GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -154,61 +170,69 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; 
GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB0_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s8 -; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s18 +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s18 +; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst ret float %result } -define void 
@buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: +define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: v_mov_b32_e32 v1, s6 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: v_mov_b32_e32 v1, s6 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s4, 0x400 +; GFX940-NEXT: s_addk_i32 s6, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v0, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 @@ -231,10 +255,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 
; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -242,53 +266,65 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s8 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v1, s18 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 
; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s8 -; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s10, s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, s18 +; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, s10 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -296,27 +332,31 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 ; 
GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s18 +; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -324,43 +364,51 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX8-NEXT: 
buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s8 -; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s18 +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s18 +; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - 
%unused = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst ret void } -define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: +define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr addrspace(7) %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -393,7 +441,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 @@ -454,7 +502,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s1, exec_lo @@ -484,7 +532,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s5, exec_lo @@ -513,7 +561,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 @@ -572,7 +620,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 @@ -632,7 +680,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 @@ -692,7 +740,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b64 s[6:7], exec @@ -718,7 +766,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b64 s[6:7], exec @@ -745,357 +793,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret float %result -} - -define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s4, 0x400 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: 
v_max_f32_e32 v2, v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB3_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s8 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: 
buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s10 -; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s10, s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s10 -; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: 
s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB3_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 -; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB3_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s8 -; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen 
offset:1024 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 - ret float %result -} - -define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s4, 0x400 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 
0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB4_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s8 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s10 -; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: 
v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s10, s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s10 -; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB4_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; 
GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 -; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB4_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s8 -; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = 
atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst ret float %result } @@ -1103,8 +801,8 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; double ; -------------------------------------------------------------------- -define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: +define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1112,14 +810,14 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: s_addk_co_i32 s4, 0x800 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 @@ -1137,33 +835,33 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-NEXT: s_cbranch_execnz .LBB3_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x800 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: s_add_i32 s4, s6, 0x800 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 @@ -1182,43 +880,55 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: 
s_cbranch_execnz .LBB5_1 +; GFX11-NEXT: s_cbranch_execnz .LBB3_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: v_mov_b32_e32 v2, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: 
s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: v_mov_b32_e32 v0, s18 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: s_add_i32 s10, s8, 0x800 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s10 -; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v10, v1 @@ -1229,29 +939,33 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_mov_b32_e32 v1, v8 ; GFX908-NEXT: v_mov_b32_e32 v2, v9 ; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: 
s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 ; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: s_add_i32 s10, s8, 0x800 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s10 -; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v10, v1 @@ -1262,56 +976,64 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_mov_b32_e32 v1, v8 ; GFX8-NEXT: v_mov_b32_e32 v2, v9 ; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 
v2, s8 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst ret double %result } -define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: 
s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] -; GFX12-NEXT: s_addk_co_i32 s4, 0x800 +; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] @@ -1328,32 +1050,32 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; 
GFX11-NEXT: s_addk_i32 s4, 0x800 +; GFX11-NEXT: s_add_i32 s4, s6, 0x800 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -1371,41 +1093,53 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: s_cbranch_execnz .LBB4_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: v_mov_b32_e32 v2, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: 
buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s8 -; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[4:7], 0 offen offset:2048 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v2, s18 +; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048 ; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX908-NEXT: s_add_i32 s10, s8, 0x800 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s10 -; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -1414,29 +1148,33 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_mov_b32_e32 v9, v2 ; GFX908-NEXT: v_mov_b32_e32 v8, v1 ; GFX908-NEXT: v_mov_b32_e32 v7, v0 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] 
; GFX908-NEXT: v_mov_b32_e32 v2, v7 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v8 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s8 -; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[4:7], 0 offen offset:2048 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048 ; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX8-NEXT: s_add_i32 s10, s8, 0x800 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s10 -; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -1445,44 +1183,52 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_mov_b32_e32 v9, v2 ; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v0 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: 
v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, v7 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v8 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw 
fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst ret void } -define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall(ptr addrspace(7) %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1493,7 +1239,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_readfirstlane_b32 s4, v9 ; GFX12-NEXT: v_readfirstlane_b32 s5, v10 @@ -1508,14 +1254,14 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB7_1 +; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB5_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB5_4 Depth 2 ; GFX12-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] ; GFX12-NEXT: s_mov_b32 s2, exec_lo @@ -1524,7 +1270,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[0:1], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 -; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX12-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v9 ; GFX12-NEXT: v_readfirstlane_b32 s5, v10 @@ -1539,8 +1285,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB7_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB5_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] @@ -1549,19 +1295,19 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB7_3 +; GFX12-NEXT: s_cbranch_execnz .LBB5_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v6 ; GFX940-NEXT: v_mov_b32_e32 v6, 
v5 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -1576,7 +1322,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 +; GFX940-NEXT: s_cbranch_execnz .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1585,7 +1331,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 @@ -1593,7 +1339,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_readfirstlane_b32 s4, v9 ; GFX11-NEXT: v_readfirstlane_b32 s5, v10 @@ -1608,14 +1354,14 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; 
GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: s_cbranch_execnz .LBB5_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB5_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB5_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -1624,7 +1370,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 -; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX11-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v9 ; GFX11-NEXT: v_readfirstlane_b32 s5, v10 @@ -1639,8 +1385,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB7_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB5_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] @@ -1650,17 +1396,17 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB7_3 +; GFX11-NEXT: s_cbranch_execnz .LBB5_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; 
GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s5, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -1675,7 +1421,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB7_1 +; GFX10-NEXT: s_cbranch_execnz .LBB5_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1685,609 +1431,223 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v6 ; GFX90A-NEXT: v_mov_b32_e32 v6, v5 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 -; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 -; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], 
v[2:3] -; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 -; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v8, v3 -; GFX908-NEXT: v_mov_b32_e32 v7, v2 -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4 -; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_readfirstlane_b32 s8, v9 -; GFX908-NEXT: v_readfirstlane_b32 s9, v10 -; GFX908-NEXT: v_readfirstlane_b32 s10, v7 -; GFX908-NEXT: v_readfirstlane_b32 s11, v8 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: ; implicit-def: $vgpr4 -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_1 -; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] -; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB7_3: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child 
Loop BB7_4 Depth 2 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] -; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v11 -; GFX908-NEXT: v_mov_b32_e32 v1, v12 -; GFX908-NEXT: v_mov_b32_e32 v2, v13 -; GFX908-NEXT: v_mov_b32_e32 v3, v14 -; GFX908-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 -; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: v_readfirstlane_b32 s8, v9 -; GFX908-NEXT: v_readfirstlane_b32 s9, v10 -; GFX908-NEXT: v_readfirstlane_b32 s10, v7 -; GFX908-NEXT: v_readfirstlane_b32 s11, v8 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] -; GFX908-NEXT: v_mov_b32_e32 v14, v1 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v13, v0 -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB7_3 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, v3 -; GFX8-NEXT: v_mov_b32_e32 v7, v2 -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; 
GFX8-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_readfirstlane_b32 s8, v9 -; GFX8-NEXT: v_readfirstlane_b32 s9, v10 -; GFX8-NEXT: v_readfirstlane_b32 s10, v7 -; GFX8-NEXT: v_readfirstlane_b32 s11, v8 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_1 -; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB7_3: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB7_4 Depth 2 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] -; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v11 -; GFX8-NEXT: v_mov_b32_e32 v1, v12 -; GFX8-NEXT: v_mov_b32_e32 v2, v13 -; GFX8-NEXT: v_mov_b32_e32 v3, v14 -; GFX8-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 -; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX8-NEXT: v_readfirstlane_b32 s8, v9 -; GFX8-NEXT: v_readfirstlane_b32 s9, v10 -; GFX8-NEXT: v_readfirstlane_b32 s10, v7 -; GFX8-NEXT: v_readfirstlane_b32 s11, v8 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] -; GFX8-NEXT: s_waitcnt 
vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] -; GFX8-NEXT: v_mov_b32_e32 v14, v1 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v13, v0 -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB7_3 -; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_readfirstlane_b32 s8, v0 -; GFX7-NEXT: v_readfirstlane_b32 s9, v1 -; GFX7-NEXT: v_readfirstlane_b32 s10, v2 -; GFX7-NEXT: v_readfirstlane_b32 s11, v3 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc -; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX7-NEXT: ; implicit-def: $vgpr4 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB7_1 -; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, v5 -; GFX7-NEXT: v_mov_b32_e32 v1, v6 -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s9, v1 -; GFX6-NEXT: v_readfirstlane_b32 s10, v2 -; GFX6-NEXT: 
v_readfirstlane_b32 s11, v3 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc -; GFX6-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX6-NEXT: ; implicit-def: $vgpr4 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB7_1 -; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v5 -; GFX6-NEXT: v_mov_b32_e32 v1, v6 -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret double %result -} - -define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: s_addk_co_i32 s4, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 
v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB8_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB8_1: ; 
%atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s8 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: s_add_i32 s10, s8, 0x800 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s10 -; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: s_add_i32 s10, s8, 0x800 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s10 -; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 
-; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 - ret double %result -} - -define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { -; GFX12-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: s_addk_co_i32 s4, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB9_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; 
GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s8 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 +; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 +; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4 +; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: s_add_i32 s10, s8, 0x800 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s10 -; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v8, v3 +; GFX908-NEXT: v_mov_b32_e32 v7, v2 ; GFX908-NEXT: v_mov_b32_e32 v10, v1 ; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4 +; GFX908-NEXT: s_mov_b64 s[6:7], exec +; GFX908-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_readfirstlane_b32 s8, v9 +; GFX908-NEXT: v_readfirstlane_b32 s9, v10 +; GFX908-NEXT: v_readfirstlane_b32 s10, v7 +; GFX908-NEXT: v_readfirstlane_b32 s11, v8 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: ; implicit-def: $vgpr4 +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: ; %bb.2: +; GFX908-NEXT: s_mov_b64 exec, s[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB5_3: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Loop Header: Depth=1 +; GFX908-NEXT: ; Child Loop BB5_4 Depth 2 +; 
GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v11 +; GFX908-NEXT: v_mov_b32_e32 v1, v12 +; GFX908-NEXT: v_mov_b32_e32 v2, v13 +; GFX908-NEXT: v_mov_b32_e32 v3, v14 +; GFX908-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 +; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX908-NEXT: v_readfirstlane_b32 s8, v9 +; GFX908-NEXT: v_readfirstlane_b32 s9, v10 +; GFX908-NEXT: v_readfirstlane_b32 s10, v7 +; GFX908-NEXT: v_readfirstlane_b32 s11, v8 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB5_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 +; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] +; GFX908-NEXT: v_mov_b32_e32 v14, v1 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v13, v0 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB9_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB5_3 +; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX8-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: s_add_i32 s10, s8, 0x800 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s10 -; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v8, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, v2 ; GFX8-NEXT: v_mov_b32_e32 v10, v1 ; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_readfirstlane_b32 s8, v9 +; GFX8-NEXT: v_readfirstlane_b32 s9, v10 +; GFX8-NEXT: v_readfirstlane_b32 s10, v7 +; GFX8-NEXT: v_readfirstlane_b32 s11, v8 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: ; implicit-def: $vgpr4 +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: ; %bb.2: +; GFX8-NEXT: s_mov_b64 exec, s[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB5_3: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Loop Header: Depth=1 +; 
GFX8-NEXT: ; Child Loop BB5_4 Depth 2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v11 +; GFX8-NEXT: v_mov_b32_e32 v1, v12 +; GFX8-NEXT: v_mov_b32_e32 v2, v13 +; GFX8-NEXT: v_mov_b32_e32 v3, v14 +; GFX8-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 +; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX8-NEXT: v_readfirstlane_b32 s8, v9 +; GFX8-NEXT: v_readfirstlane_b32 s9, v10 +; GFX8-NEXT: v_readfirstlane_b32 s10, v7 +; GFX8-NEXT: v_readfirstlane_b32 s11, v8 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB5_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 +; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] +; GFX8-NEXT: v_mov_b32_e32 v14, v1 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v13, v0 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB9_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB5_3 +; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: ; 
GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX7-NEXT: s_mov_b64 s[6:7], exec +; GFX7-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_readfirstlane_b32 s8, v0 +; GFX7-NEXT: v_readfirstlane_b32 s9, v1 +; GFX7-NEXT: v_readfirstlane_b32 s10, v2 +; GFX7-NEXT: v_readfirstlane_b32 s11, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX7-NEXT: ; implicit-def: $vgpr4 +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7-NEXT: ; %bb.2: +; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, v5 +; GFX7-NEXT: v_mov_b32_e32 v1, v6 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s9, v1 +; GFX6-NEXT: v_readfirstlane_b32 s10, v2 +; GFX6-NEXT: v_readfirstlane_b32 s11, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], 
s[4:5] +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX6-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX6-NEXT: ; implicit-def: $vgpr4 +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB5_1 +; GFX6-NEXT: ; %bb.2: +; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, v5 +; GFX6-NEXT: v_mov_b32_e32 v1, v6 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst ret double %result } @@ -2295,26 +1655,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; half ; -------------------------------------------------------------------- -define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: +define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s4, 0x200 +; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0 -; GFX12-NEXT: s_and_b32 s5, s4, -4 -; GFX12-NEXT: s_and_b32 s4, s4, 3 -; GFX12-NEXT: v_mov_b32_e32 v4, s5 +; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | 
instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-NEXT: s_and_b32 s4, s6, 3 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -2336,26 +1697,26 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s4, 0x200 -; GFX940-NEXT: s_and_b32 s5, s4, -4 -; GFX940-NEXT: v_mov_b32_e32 v4, s5 +; GFX940-NEXT: s_addk_i32 s6, 0x200 +; GFX940-NEXT: s_and_b32 s4, s6, -4 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s4, 3 +; GFX940-NEXT: s_and_b32 s4, s6, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX940-NEXT: 
.LBB6_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -2372,28 +1733,29 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-NEXT: s_cbranch_execnz .LBB6_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s4, 0x200 +; GFX11-NEXT: s_addk_i32 s6, 0x200 ; GFX11-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX11-NEXT: s_and_b32 s5, s4, -4 -; GFX11-NEXT: s_and_b32 s4, s4, 3 -; GFX11-NEXT: v_mov_b32_e32 v4, s5 +; GFX11-NEXT: s_and_b32 s4, s6, -4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_and_b32 s4, s6, 3 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: s_not_b32 s6, s5 ; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -2416,264 +1778,289 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; 
GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-NEXT: s_cbranch_execnz .LBB6_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s8, 0x200 +; GFX10-NEXT: s_addk_i32 s18, 0x200 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_and_b32 s4, s18, -4 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_and_b32 s4, s18, 3 ; GFX10-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX10-NEXT: s_and_b32 s9, s8, -4 -; GFX10-NEXT: s_and_b32 s8, s8, 3 -; GFX10-NEXT: v_mov_b32_e32 v4, s9 -; GFX10-NEXT: s_lshl_b32 s8, s8, 3 -; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 -; GFX10-NEXT: s_not_b32 s10, s9 -; GFX10-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX10-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s10, 
v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_cbranch_execnz .LBB10_1 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB6_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s8, 0x200 -; GFX90A-NEXT: s_and_b32 s9, s8, -4 -; GFX90A-NEXT: v_mov_b32_e32 v4, s9 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; GFX90A-NEXT: s_and_b32 s8, s8, 3 -; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 -; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX90A-NEXT: s_not_b32 s11, s8 -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: s_addk_i32 s18, 0x200 +; GFX90A-NEXT: s_and_b32 s4, s18, -4 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s18, 3 +; 
GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX90A-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s10, v0 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s8, 0x200 -; GFX908-NEXT: s_and_b32 s9, s8, -4 -; GFX908-NEXT: v_mov_b32_e32 v4, s9 -; GFX908-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; 
GFX908-NEXT: s_and_b32 s8, s8, 3 -; GFX908-NEXT: s_lshl_b32 s10, s8, 3 -; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX908-NEXT: s_not_b32 s11, s8 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: s_addk_i32 s18, 0x200 +; GFX908-NEXT: s_and_b32 s4, s18, -4 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX908-NEXT: s_and_b32 s4, s18, 3 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX908-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s10, v0 -; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; 
GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s8, 0x200 -; GFX8-NEXT: s_and_b32 s9, s8, -4 -; GFX8-NEXT: v_mov_b32_e32 v4, s9 -; GFX8-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; GFX8-NEXT: s_and_b32 s8, s8, 3 -; GFX8-NEXT: s_lshl_b32 s10, s8, 3 -; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX8-NEXT: s_not_b32 s11, s8 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: s_addk_i32 s18, 0x200 +; GFX8-NEXT: s_and_b32 s4, s18, -4 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX8-NEXT: s_and_b32 s4, s18, 3 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX8-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX8-NEXT: v_and_b32_e32 v2, s11, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; 
GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s8, 0x200 -; GFX7-NEXT: s_and_b32 s9, s8, -4 -; GFX7-NEXT: v_mov_b32_e32 v4, s9 -; GFX7-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX7-NEXT: s_addk_i32 s18, 0x200 +; GFX7-NEXT: s_and_b32 s4, s18, -4 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_and_b32 s8, s8, 3 -; GFX7-NEXT: s_lshl_b32 s10, s8, 3 -; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX7-NEXT: s_and_b32 s4, s18, 3 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX7-NEXT: s_not_b32 s11, s8 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: 
v_and_b32_e32 v2, s11, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB6_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s8, 0x200 -; GFX6-NEXT: s_and_b32 s9, s8, -4 -; GFX6-NEXT: v_mov_b32_e32 v4, s9 -; GFX6-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX6-NEXT: s_addk_i32 s18, 0x200 +; GFX6-NEXT: s_and_b32 s4, s18, -4 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s8, s8, 3 -; GFX6-NEXT: s_lshl_b32 s10, s8, 3 -; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX6-NEXT: 
s_and_b32 s4, s18, 3 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-NEXT: s_not_b32 s11, s8 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s11, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 ; GFX6-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB10_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB6_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr 
addrspace(7) %gep, half %val syncscope("agent") seq_cst ret half %result } -define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: +define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s4, 0x200 +; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 -; GFX12-NEXT: s_and_b32 s5, s4, -4 -; GFX12-NEXT: s_and_b32 s4, s4, 3 -; GFX12-NEXT: v_mov_b32_e32 v2, s5 +; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_and_b32 s4, s6, 3 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -2695,25 +2082,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-NEXT: s_cbranch_execnz 
.LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s4, 0x200 -; GFX940-NEXT: s_and_b32 s5, s4, -4 -; GFX940-NEXT: v_mov_b32_e32 v2, s5 +; GFX940-NEXT: s_addk_i32 s6, 0x200 +; GFX940-NEXT: s_and_b32 s4, s6, -4 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s4, 3 +; GFX940-NEXT: s_and_b32 s4, s6, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -2730,27 +2117,28 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 +; GFX940-NEXT: s_cbranch_execnz .LBB7_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s4, 0x200 +; GFX11-NEXT: s_addk_i32 s6, 0x200 ; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX11-NEXT: s_and_b32 s5, s4, -4 -; GFX11-NEXT: s_and_b32 s4, s4, 3 -; GFX11-NEXT: 
v_mov_b32_e32 v2, s5 +; GFX11-NEXT: s_and_b32 s4, s6, -4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: s_and_b32 s4, s6, 3 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: s_not_b32 s6, s5 ; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -2773,237 +2161,261 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-NEXT: s_cbranch_execnz .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s8, 0x200 +; GFX10-NEXT: s_addk_i32 s18, 0x200 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_and_b32 s4, s18, -4 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_and_b32 s4, s18, 3 ; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX10-NEXT: s_and_b32 s9, s8, -4 -; GFX10-NEXT: s_and_b32 s8, s8, 3 -; GFX10-NEXT: v_mov_b32_e32 v2, s9 -; GFX10-NEXT: s_lshl_b32 s8, s8, 3 
-; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 -; GFX10-NEXT: s_not_b32 s10, s9 -; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX10-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_cbranch_execnz .LBB11_1 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB7_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s8, 0x200 -; GFX90A-NEXT: s_and_b32 s9, s8, -4 -; GFX90A-NEXT: v_mov_b32_e32 v2, s9 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX90A-NEXT: s_and_b32 s8, s8, 3 -; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 -; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX90A-NEXT: s_not_b32 s11, s8 -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: s_addk_i32 s18, 0x200 +; GFX90A-NEXT: s_and_b32 s4, s18, -4 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s18, 3 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX90A-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s10, v0 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: 
s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s8, 0x200 -; GFX908-NEXT: s_and_b32 s9, s8, -4 -; GFX908-NEXT: v_mov_b32_e32 v2, s9 -; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX908-NEXT: s_and_b32 s8, s8, 3 -; GFX908-NEXT: s_lshl_b32 s10, s8, 3 -; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX908-NEXT: s_not_b32 s11, s8 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: s_addk_i32 s18, 0x200 +; GFX908-NEXT: s_and_b32 s4, s18, -4 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v2, s4 +; GFX908-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX908-NEXT: s_and_b32 s4, s18, 3 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX908-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s10, v0 -; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: 
v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s8, 0x200 -; GFX8-NEXT: s_and_b32 s9, s8, -4 -; GFX8-NEXT: v_mov_b32_e32 v2, s9 -; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX8-NEXT: s_and_b32 s8, s8, 3 -; GFX8-NEXT: s_lshl_b32 s10, s8, 3 -; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX8-NEXT: s_not_b32 s11, s8 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: s_addk_i32 s18, 0x200 +; GFX8-NEXT: s_and_b32 s4, s18, -4 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX8-NEXT: s_and_b32 s4, s18, 3 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX8-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX8-NEXT: v_and_b32_e32 v4, s11, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB11_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s8, 0x200 -; GFX7-NEXT: s_and_b32 s9, s8, -4 -; GFX7-NEXT: v_mov_b32_e32 v2, s9 -; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX7-NEXT: s_addk_i32 s18, 0x200 +; GFX7-NEXT: s_and_b32 s4, s18, -4 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_and_b32 s8, s8, 3 -; GFX7-NEXT: s_lshl_b32 s10, s8, 3 -; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 +; 
GFX7-NEXT: s_and_b32 s4, s18, 3 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX7-NEXT: s_not_b32 s11, s8 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s11, v1 +; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s8, 0x200 -; GFX6-NEXT: s_and_b32 s9, s8, -4 -; GFX6-NEXT: v_mov_b32_e32 v2, s9 -; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; 
GFX6-NEXT: s_addk_i32 s18, 0x200 +; GFX6-NEXT: s_and_b32 s4, s18, -4 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s8, s8, 3 -; GFX6-NEXT: s_lshl_b32 s10, s8, 3 -; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX6-NEXT: s_and_b32 s4, s18, 3 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX6-NEXT: s_not_b32 s11, s8 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s11, v1 +; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 ; GFX6-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB11_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB7_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; 
GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst ret void } -define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr addrspace(7) %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -3019,7 +2431,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX12-NEXT: v_not_b32_e32 v9, v6 -; GFX12-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -3032,14 +2444,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB8_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This 
Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB8_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo @@ -3054,7 +2466,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 +; GFX12-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -3069,8 +2481,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB12_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB8_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -3079,13 +2491,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB12_3 +; GFX12-NEXT: s_cbranch_execnz .LBB8_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -3096,7 +2508,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 ; GFX940-NEXT: v_not_b32_e32 v10, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -3108,14 +2520,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-NEXT: s_cbranch_execnz .LBB8_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_max_f16_e32 v11, v5, v5 -; GFX940-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB8_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB8_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v8, v7 ; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 @@ -3125,7 +2537,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 +; GFX940-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -3139,8 +2551,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; 
GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB12_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB8_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -3148,13 +2560,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB12_3 +; GFX940-NEXT: s_cbranch_execnz .LBB8_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -3167,7 +2579,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX11-NEXT: v_not_b32_e32 v9, v6 -; GFX11-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -3180,14 +2592,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-NEXT: s_cbranch_execnz .LBB8_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: 
s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB8_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB8_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -3202,7 +2614,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 +; GFX11-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -3217,8 +2629,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB12_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB8_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -3228,13 +2640,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB12_3 +; GFX11-NEXT: s_cbranch_execnz .LBB8_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -3245,7 +2657,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 ; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX10-NEXT: v_not_b32_e32 v9, v6 -; GFX10-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -3257,13 +2669,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-NEXT: s_cbranch_execnz .LBB8_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_max_f16_e32 v10, v5, v5 -; GFX10-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB8_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB8_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo @@ -3274,7 +2686,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 +; GFX10-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -3288,8 +2700,8 @@ define half 
@buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB12_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB8_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -3299,13 +2711,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB12_3 +; GFX10-NEXT: s_cbranch_execnz .LBB8_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -3316,7 +2728,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 ; GFX90A-NEXT: v_not_b32_e32 v10, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -3328,14 +2740,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, 
exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_max_f16_e32 v11, v5, v5 -; GFX90A-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB8_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB8_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 @@ -3344,7 +2756,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 +; GFX90A-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -3357,8 +2769,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB8_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -3366,13 +2778,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB8_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 
exec, exec, s[6:7] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -3383,7 +2795,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX908-NEXT: v_not_b32_e32 v9, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -3395,14 +2807,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: s_cbranch_execnz .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_max_f16_e32 v10, v5, v5 -; GFX908-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB8_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB8_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 @@ -3412,7 +2824,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 +; GFX908-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 ; 
GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -3425,8 +2837,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB8_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -3434,13 +2846,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB12_3 +; GFX908-NEXT: s_cbranch_execnz .LBB8_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 @@ -3451,7 +2863,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX8-NEXT: v_not_b32_e32 v9, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -3463,14 +2875,14 @@ define half 
@buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: s_cbranch_execnz .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_max_f16_e32 v10, v5, v5 -; GFX8-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB8_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB8_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 @@ -3481,7 +2893,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 +; GFX8-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -3494,8 +2906,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB8_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -3503,13 +2915,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB12_3 +; GFX8-NEXT: s_cbranch_execnz .LBB8_3 
; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -3519,7 +2931,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -3530,15 +2942,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7-NEXT: s_cbranch_execnz .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4 -; GFX7-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB8_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB8_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 @@ -3550,7 +2962,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: v_mov_b32_e32 v5, v6 -; GFX7-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 +; 
GFX7-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -3563,8 +2975,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB12_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB8_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -3572,14 +2984,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB12_3 +; GFX7-NEXT: s_cbranch_execnz .LBB8_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -3589,7 +3001,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -3600,15 +3012,15 @@ define 
half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB12_1 +; GFX6-NEXT: s_cbranch_execnz .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4 -; GFX6-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB8_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB8_4 Depth 2 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 @@ -3620,7 +3032,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: v_mov_b32_e32 v5, v6 -; GFX6-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 +; GFX6-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -3633,8 +3045,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB12_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB8_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -3642,7 +3054,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, 
s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB12_3 +; GFX6-NEXT: s_cbranch_execnz .LBB8_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 @@ -3650,7 +3062,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst ret half %result } @@ -3658,26 +3070,27 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; bfloat ; -------------------------------------------------------------------- -define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: +define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s4, 0x200 +; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX12-NEXT: s_and_b32 s5, s4, -4 -; GFX12-NEXT: s_and_b32 s4, s4, 3 -; GFX12-NEXT: v_mov_b32_e32 v4, s5 +; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-NEXT: s_and_b32 s4, s6, 3 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3706,27 +3119,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s4, 0x200 -; GFX940-NEXT: s_and_b32 s5, s4, -4 -; GFX940-NEXT: v_mov_b32_e32 v4, s5 +; GFX940-NEXT: s_addk_i32 s6, 0x200 +; GFX940-NEXT: s_and_b32 s4, s6, -4 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s4, 3 +; GFX940-NEXT: s_and_b32 s4, s6, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3748,28 +3161,29 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NEXT: s_cbranch_execnz .LBB9_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s4, 0x200 +; GFX11-NEXT: s_addk_i32 s6, 0x200 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: s_and_b32 s5, s4, -4 -; GFX11-NEXT: s_and_b32 s4, s4, 3 -; GFX11-NEXT: v_mov_b32_e32 v4, s5 +; GFX11-NEXT: s_and_b32 s4, s6, -4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_and_b32 s4, s6, 3 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: s_not_b32 s6, s5 ; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3799,29 +3213,33 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-NEXT: s_cbranch_execnz .LBB9_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s8, 0x200 +; GFX10-NEXT: s_addk_i32 s18, 0x200 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_and_b32 s4, s18, -4 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_and_b32 s4, s18, 3 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: s_and_b32 s9, s8, -4 -; GFX10-NEXT: s_and_b32 s8, s8, 3 -; GFX10-NEXT: v_mov_b32_e32 v4, s9 -; GFX10-NEXT: s_lshl_b32 s8, s8, 3 -; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 -; GFX10-NEXT: s_not_b32 s10, s9 -; GFX10-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX10-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 @@ -3829,121 +3247,133 @@ define bfloat 
@buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB9_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s8, 0x200 -; GFX90A-NEXT: s_and_b32 s9, s8, -4 -; GFX90A-NEXT: v_mov_b32_e32 v4, s9 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; GFX90A-NEXT: s_and_b32 s8, s8, 3 -; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 -; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX90A-NEXT: s_not_b32 s11, s8 -; GFX90A-NEXT: s_mov_b64 
s[8:9], 0 +; GFX90A-NEXT: s_addk_i32 s18, 0x200 +; GFX90A-NEXT: s_and_b32 s4, s18, -4 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s18, 3 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff -; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s8, 0x200 -; GFX908-NEXT: s_and_b32 s9, s8, -4 -; GFX908-NEXT: v_mov_b32_e32 v4, s9 -; GFX908-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; GFX908-NEXT: s_and_b32 s8, s8, 3 -; GFX908-NEXT: s_lshl_b32 s10, s8, 3 -; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX908-NEXT: s_not_b32 s11, s8 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: s_addk_i32 s18, 0x200 +; GFX908-NEXT: s_and_b32 s4, s18, -4 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX908-NEXT: s_and_b32 s4, s18, 3 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff -; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD ; GFX908-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX908-NEXT: v_add3_u32 v2, v2, v0, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s8, 0x200 -; GFX8-NEXT: s_and_b32 s9, s8, -4 -; GFX8-NEXT: v_mov_b32_e32 v4, s9 -; GFX8-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; GFX8-NEXT: s_and_b32 s8, s8, 3 -; GFX8-NEXT: s_lshl_b32 s10, s8, 3 -; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX8-NEXT: s_not_b32 
s11, s8 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: s_addk_i32 s18, 0x200 +; GFX8-NEXT: s_and_b32 s4, s18, -4 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX8-NEXT: s_and_b32 s4, s18, 3 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f32_e32 v3, v3, v5 @@ -3953,132 +3383,141 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v2, s11, v1 +; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB9_1 ; 
GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s8, 0x200 -; GFX7-NEXT: s_and_b32 s9, s8, -4 -; GFX7-NEXT: v_mov_b32_e32 v4, s9 -; GFX7-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; GFX7-NEXT: s_and_b32 s8, s8, 3 -; GFX7-NEXT: s_lshl_b32 s10, s8, 3 -; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX7-NEXT: s_addk_i32 s18, 0x200 +; GFX7-NEXT: s_and_b32 s4, s18, -4 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX7-NEXT: s_and_b32 s4, s18, 3 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s11, s8 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s11, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: 
v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB13_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB9_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s8, 0x200 -; GFX6-NEXT: s_and_b32 s9, s8, -4 -; GFX6-NEXT: v_mov_b32_e32 v4, s9 -; GFX6-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; GFX6-NEXT: s_and_b32 s8, s8, 3 -; GFX6-NEXT: s_lshl_b32 s10, s8, 3 -; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX6-NEXT: s_addk_i32 s18, 0x200 +; GFX6-NEXT: s_and_b32 s4, s18, -4 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX6-NEXT: s_and_b32 s4, s18, 3 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s11, s8 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; 
GFX6-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s11, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB13_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB9_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst ret bfloat %result } -define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { 
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: +define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s4, 0x200 +; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_and_b32 s5, s4, -4 -; GFX12-NEXT: s_and_b32 s4, s4, 3 -; GFX12-NEXT: v_mov_b32_e32 v2, s5 +; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_and_b32 s4, s6, 3 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -4107,26 +3546,26 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s4, 0x200 -; GFX940-NEXT: s_and_b32 s5, s4, -4 -; GFX940-NEXT: v_mov_b32_e32 v2, s5 +; GFX940-NEXT: s_addk_i32 s6, 0x200 +; GFX940-NEXT: s_and_b32 s4, s6, -4 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s4, 3 +; GFX940-NEXT: s_and_b32 s4, s6, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4148,27 +3587,28 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NEXT: s_cbranch_execnz .LBB10_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s4, 0x200 +; GFX11-NEXT: s_addk_i32 s6, 0x200 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: s_and_b32 s5, s4, -4 -; GFX11-NEXT: s_and_b32 s4, s4, 3 -; GFX11-NEXT: v_mov_b32_e32 v2, s5 +; 
GFX11-NEXT: s_and_b32 s4, s6, -4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: s_and_b32 s4, s6, 3 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: s_not_b32 s6, s5 ; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -4198,28 +3638,32 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s8, 0x200 +; GFX10-NEXT: s_addk_i32 s18, 0x200 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_and_b32 s4, s18, -4 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_and_b32 s4, s18, 3 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX10-NEXT: s_and_b32 s9, s8, -4 -; GFX10-NEXT: s_and_b32 s8, s8, 3 -; GFX10-NEXT: v_mov_b32_e32 v2, s9 -; GFX10-NEXT: s_lshl_b32 s8, s8, 3 -; GFX10-NEXT: 
s_lshl_b32 s9, 0xffff, s8 -; GFX10-NEXT: s_not_b32 s10, s9 -; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX10-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 @@ -4227,118 +3671,130 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, 
exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s8, 0x200 -; GFX90A-NEXT: s_and_b32 s9, s8, -4 -; GFX90A-NEXT: v_mov_b32_e32 v2, s9 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX90A-NEXT: s_and_b32 s8, s8, 3 -; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 -; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX90A-NEXT: s_not_b32 s11, s8 -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: s_addk_i32 s18, 0x200 +; GFX90A-NEXT: s_and_b32 s4, s18, -4 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s18, 3 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff -; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, 
v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s8, 0x200 -; GFX908-NEXT: s_and_b32 s9, s8, -4 -; GFX908-NEXT: v_mov_b32_e32 v2, s9 -; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX908-NEXT: s_and_b32 s8, s8, 3 -; GFX908-NEXT: s_lshl_b32 s10, s8, 3 -; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX908-NEXT: s_not_b32 s11, s8 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: s_addk_i32 s18, 0x200 +; GFX908-NEXT: s_and_b32 s4, s18, -4 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 
v2, s4 +; GFX908-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX908-NEXT: s_and_b32 s4, s18, 3 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff -; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX908-NEXT: v_add3_u32 v4, v4, v0, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 
exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s8, 0x200 -; GFX8-NEXT: s_and_b32 s9, s8, -4 -; GFX8-NEXT: v_mov_b32_e32 v2, s9 -; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX8-NEXT: s_and_b32 s8, s8, 3 -; GFX8-NEXT: s_lshl_b32 s10, s8, 3 -; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX8-NEXT: s_not_b32 s11, s8 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: s_addk_i32 s18, 0x200 +; GFX8-NEXT: s_and_b32 s4, s18, -4 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX8-NEXT: s_and_b32 s4, s18, 3 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f32_e32 v5, v5, v3 @@ -4348,109 +3804,117 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s11, v1 +; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; 
GFX8-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s8, 0x200 -; GFX7-NEXT: s_and_b32 s9, s8, -4 -; GFX7-NEXT: v_mov_b32_e32 v2, s9 -; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX7-NEXT: s_and_b32 s8, s8, 3 -; GFX7-NEXT: s_lshl_b32 s10, s8, 3 -; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX7-NEXT: s_addk_i32 s18, 0x200 +; GFX7-NEXT: s_and_b32 s4, s18, -4 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX7-NEXT: s_and_b32 s4, s18, 3 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s11, s8 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB10_1: ; 
%atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s11, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s8, 0x200 -; GFX6-NEXT: s_and_b32 s9, s8, -4 -; GFX6-NEXT: v_mov_b32_e32 v2, s9 -; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX6-NEXT: s_and_b32 s8, s8, 3 -; GFX6-NEXT: s_lshl_b32 s10, s8, 3 -; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX6-NEXT: s_addk_i32 s18, 0x200 +; GFX6-NEXT: s_and_b32 s4, s18, -4 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: 
s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX6-NEXT: s_and_b32 s4, s18, 3 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s11, s8 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s11, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB14_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val 
syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst ret void } -define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr addrspace(7) %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -4466,7 +3930,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX12-NEXT: v_not_b32_e32 v9, v6 -; GFX12-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -4479,14 +3943,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB11_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB11_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 
s2, exec_lo @@ -4508,7 +3972,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX12-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -4523,8 +3987,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB11_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -4533,13 +3997,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB15_3 +; GFX12-NEXT: s_cbranch_execnz .LBB11_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -4550,7 +4014,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 ; 
GFX940-NEXT: v_not_b32_e32 v10, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -4562,15 +4026,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NEXT: s_cbranch_execnz .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v5 ; GFX940-NEXT: s_movk_i32 s10, 0x7fff -; GFX940-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB11_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB11_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX940-NEXT: s_mov_b64 s[8:9], exec @@ -4585,7 +4049,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX940-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -4599,8 +4063,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen 
sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB11_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -4608,13 +4072,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB15_3 +; GFX940-NEXT: s_cbranch_execnz .LBB11_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -4627,7 +4091,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX11-NEXT: v_not_b32_e32 v9, v6 -; GFX11-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -4640,15 +4104,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: s_cbranch_execnz .LBB11_1 ; GFX11-NEXT: ; %bb.2: 
; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB11_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB11_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -4670,7 +4134,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX11-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -4685,8 +4149,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB11_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -4696,14 +4160,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB15_3 +; GFX11-NEXT: s_cbranch_execnz .LBB11_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, 
v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -4714,7 +4178,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 ; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX10-NEXT: v_not_b32_e32 v9, v6 -; GFX10-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -4726,13 +4190,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: s_cbranch_execnz .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX10-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB11_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB11_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_mov_b32 s6, exec_lo @@ -4747,7 +4211,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX10-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 ; GFX10-NEXT: ; => This Inner 
Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -4761,8 +4225,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB15_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB11_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -4772,13 +4236,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB15_3 +; GFX10-NEXT: s_cbranch_execnz .LBB11_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -4789,7 +4253,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 ; GFX90A-NEXT: v_not_b32_e32 v10, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -4801,15 +4265,15 @@ define bfloat 
@buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v11, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff -; GFX90A-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB11_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB11_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_max_f32_e32 v4, v4, v11 @@ -4822,7 +4286,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX90A-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -4835,8 +4299,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB11_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -4844,13 +4308,13 @@ define bfloat 
@buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB11_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -4861,7 +4325,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX908-NEXT: v_not_b32_e32 v9, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -4873,15 +4337,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff -; GFX908-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB11_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB11_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, 
v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_max_f32_e32 v4, v4, v10 @@ -4895,7 +4359,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX908-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -4908,8 +4372,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB15_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB11_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4917,13 +4381,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB15_3 +; GFX908-NEXT: s_cbranch_execnz .LBB11_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 @@ -4934,7 +4398,7 @@ define bfloat 
@buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX8-NEXT: v_not_b32_e32 v9, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -4946,14 +4410,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX8-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB11_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB11_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f32_e32 v4, v4, v10 @@ -4969,7 +4433,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX8-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -4982,8 +4446,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB15_4 -; GFX8-NEXT: ; 
%bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB11_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4991,13 +4455,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB15_3 +; GFX8-NEXT: s_cbranch_execnz .LBB11_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -5007,7 +4471,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -5018,15 +4482,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 -; GFX7-NEXT: .LBB15_3: ; 
%atomicrmw.start +; GFX7-NEXT: .LBB11_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB11_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -5039,7 +4503,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec ; GFX7-NEXT: v_mov_b32_e32 v5, v6 -; GFX7-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX7-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -5052,8 +4516,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB15_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB11_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -5061,14 +4525,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB15_3 +; GFX7-NEXT: s_cbranch_execnz .LBB11_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: ; GFX6: ; %bb.0: ; 
GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -5078,7 +4542,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -5089,15 +4553,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB15_1 +; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 -; GFX6-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB11_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB11_4 Depth 2 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -5110,7 +4574,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_mov_b32_e32 v5, v6 -; GFX6-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX6-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -5123,8 +4587,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; 
GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB15_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB11_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -5132,7 +4596,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB15_3 +; GFX6-NEXT: s_cbranch_execnz .LBB11_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 @@ -5140,7 +4604,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst ret bfloat %result } @@ -5148,22 +4612,22 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; <2 x half> ; -------------------------------------------------------------------- -define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: 
s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 -; GFX12-NEXT: s_addk_co_i32 s4, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 @@ -5180,22 +4644,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: v_mov_b32_e32 v0, s6 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s4, 0x400 +; GFX940-NEXT: s_addk_i32 s6, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: 
s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 @@ -5210,22 +4674,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: s_cbranch_execnz .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_add_i32 s4, s6, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 @@ -5243,22 +4707,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: v_mov_b32_e32 v0, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_add_i32 s4, s18, 0x400 +; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 @@ -5267,57 +4735,65 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v0, s18 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s10 -; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5 ; GFX90A-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: ; GFX908: ; 
%bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s10, s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s10 -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 @@ -5325,29 +4801,33 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX908-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: ; 
GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v1, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s10 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -5358,34 +4838,38 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: v_or_b32_e32 v5, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX7-NEXT: s_add_i32 s10, s8, 0x400 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -5401,37 +4885,41 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB16_1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: 
s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX6-NEXT: s_add_i32 s10, s8, 0x400 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s10 -; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -5448,40 +4936,40 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX6-NEXT: 
v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB16_1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst ret <2 x half> %result } -define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_addk_co_i32 s4, 0x400 +; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 ; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 -; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 @@ 
-5497,21 +4985,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: v_mov_b32_e32 v1, s6 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s4, 0x400 +; GFX940-NEXT: s_addk_i32 s6, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5526,21 +5014,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: 
v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_add_i32 s4, s6, 0x400 ; GFX11-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5557,21 +5045,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v1, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_add_i32 s4, s18, 0x400 +; GFX10-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, s8 -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5579,85 +5071,97 @@ define void 
@buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX10-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s8 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v1, s18 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s10 -; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 ; 
GFX90A-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s8 -; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s10, s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, s18 +; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, s10 -; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX908-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: 
v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s18 +; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s10 -; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -5667,35 +5171,39 @@ define void 
@buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: s_add_i32 s10, s8, 0x400 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v2, s6 
+; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -5711,37 +5219,41 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: s_add_i32 s10, s8, 0x400 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: 
s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v2, s10 -; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -5758,27 +5270,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB17_1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst ret void } -define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall(ptr addrspace(7) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5787,7 +5299,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -5801,14 +5313,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB14_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB14_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo @@ -5817,7 +5329,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v8 ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX12-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 ; GFX12-NEXT: ; => This Inner 
Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -5832,8 +5344,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB14_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -5842,18 +5354,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB18_3 +; GFX12-NEXT: s_cbranch_execnz .LBB14_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -5866,21 +5378,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 
exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_1 +; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_pk_max_f16 v9, v5, v5 -; GFX940-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB14_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB14_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v4, v7, v7 ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: v_pk_max_f16 v6, v4, v9 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX940-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -5894,8 +5406,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB14_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -5903,19 +5415,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB18_3 +; GFX940-NEXT: s_cbranch_execnz .LBB14_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -5929,14 +5441,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB14_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB14_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -5945,7 +5457,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: v_pk_max_f16 v5, v4, v8 ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX11-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -5960,8 +5472,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: 
buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB14_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -5971,19 +5483,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB18_3 +; GFX11-NEXT: s_cbranch_execnz .LBB14_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -5996,13 +5508,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_1 +; GFX10-NEXT: s_cbranch_execnz .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_pk_max_f16 v8, v5, v5 -; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB14_3: ; 
%atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB14_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo @@ -6010,7 +5522,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: v_pk_max_f16 v5, v4, v8 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX10-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -6024,8 +5536,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB14_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -6035,18 +5547,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB18_3 +; GFX10-NEXT: s_cbranch_execnz .LBB14_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -6059,20 +5571,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5 -; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB14_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB14_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7 ; GFX90A-NEXT: v_pk_max_f16 v6, v4, v9 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX90A-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -6085,8 +5597,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB14_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -6094,18 +5606,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB14_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -6118,21 +5630,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_pk_max_f16 v8, v5, v5 -; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB14_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB14_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX908-NEXT: v_pk_max_f16 v5, v4, v8 ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: 
s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX908-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -6145,8 +5657,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB14_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -6154,18 +5666,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB18_3 +; GFX908-NEXT: s_cbranch_execnz .LBB14_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -6178,15 +5690,15 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v9, v5, v5 -; GFX8-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB14_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB14_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 @@ -6196,7 +5708,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX8-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -6209,8 +5721,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB14_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -6218,18 +5730,18 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB18_3 +; GFX8-NEXT: s_cbranch_execnz .LBB14_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -6241,7 +5753,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: ; implicit-def: $vgpr4 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB18_1 +; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -6253,9 +5765,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB14_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB14_4 Depth 2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: s_mov_b64 s[12:13], exec @@ -6271,7 +5783,7 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX7-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -6284,8 +5796,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB18_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB14_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 @@ -6295,19 +5807,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB18_3 +; GFX7-NEXT: s_cbranch_execnz .LBB14_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: 
v_readfirstlane_b32 s10, v2 @@ -6319,7 +5831,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: ; implicit-def: $vgpr4 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB18_1 +; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -6331,9 +5843,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v8 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB14_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB14_4 Depth 2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: s_mov_b64 s[12:13], exec @@ -6350,7 +5862,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX6-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -6363,8 +5875,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB18_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB14_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 @@ 
-6374,7 +5886,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB18_3 +; GFX6-NEXT: s_cbranch_execnz .LBB14_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, v4 @@ -6382,7 +5894,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst ret <2 x half> %result } @@ -6390,23 +5902,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; <2 x bfloat> ; -------------------------------------------------------------------- -define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 -; GFX12-NEXT: s_addk_co_i32 s4, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: v_mov_b32_e32 v4, s4 ; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v0 @@ -6439,25 +5951,25 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: v_mov_b32_e32 v0, s6 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s4, 0x400 +; GFX940-NEXT: s_add_i32 s4, s6, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s9, 0x7060302 ; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v0 @@ -6484,16 +5996,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; 
GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB19_1 +; GFX940-NEXT: s_cbranch_execnz .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_add_i32 s4, s6, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -6502,7 +6014,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v0 @@ -6536,24 +6048,28 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; 
GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 -; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s8 +; GFX10-NEXT: v_mov_b32_e32 v0, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_add_i32 s4, s18, 0x400 +; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -6569,38 +6085,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_cbranch_execnz .LBB19_1 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB15_1 ; GFX10-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_addk_i32 s8, 0x400 -; GFX90A-NEXT: s_mov_b64 s[10:11], 0 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v0, s18 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s8 -; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v0 @@ -6615,36 +6135,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s12 ; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[8:9] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s13 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: 
buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 -; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_addk_i32 s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[10:11], 0 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s18, 0x400 +; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s13, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s8 -; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v0 @@ -6659,35 +6183,39 @@ define <2 x bfloat> 
@buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX908-NEXT: v_add3_u32 v5, v5, v0, s12 ; GFX908-NEXT: v_add3_u32 v8, v8, v1, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX908-NEXT: v_perm_b32 v5, v1, v0, s13 ; GFX908-NEXT: v_mov_b32_e32 v0, v5 ; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_addk_i32 s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[10:11], 0 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s18, 0x400 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; 
GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -6704,40 +6232,44 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024 -; GFX7-NEXT: 
s_addk_i32 s8, 0x400 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s18, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_mov_b64 s[10:11], 0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -6751,35 +6283,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v1 ; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX7-NEXT: s_cbranch_execnz .LBB19_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024 -; GFX6-NEXT: s_addk_i32 s8, 0x400 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s18, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_mov_b64 s[10:11], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s8 -; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -6794,39 +6330,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v1 ; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX6-NEXT: s_cbranch_execnz .LBB19_1 +; 
GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB15_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst ret <2 x bfloat> %result } -define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX12-NEXT: s_addk_co_i32 s4, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 @@ -6857,24 +6393,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; 
GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: v_mov_b32_e32 v1, s6 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s4, 0x400 +; GFX940-NEXT: s_add_i32 s4, s6, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX940-NEXT: s_mov_b32 s9, 0x7060302 ; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6901,23 +6437,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v1, v6 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB20_1 +; GFX940-NEXT: s_cbranch_execnz .LBB16_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: 
v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_add_i32 s4, s6, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 @@ -6949,23 +6485,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: s_cbranch_execnz .LBB16_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v1, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_add_i32 s4, s18, 0x400 +; GFX10-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, s8 -; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX10-NEXT: s_mov_b32 s9, 0 -; 
GFX10-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6980,38 +6520,42 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 ; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_cbranch_execnz .LBB20_1 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s8 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_addk_i32 s8, 0x400 -; 
GFX90A-NEXT: s_mov_b64 s[10:11], 0 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v1, s18 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s8 -; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -7025,36 +6569,40 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s8 -; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_addk_i32 s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[10:11], 0 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, s18 +; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s18, 0x400 +; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX908-NEXT: s_mov_b32 s13, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s8 -; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -7068,35 +6616,39 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX908-NEXT: v_add3_u32 v6, v6, v0, s12 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX908-NEXT: v_perm_b32 v0, v5, v0, s13 ; GFX908-NEXT: v_mov_b32_e32 v6, v1 ; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], 
v4, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_addk_i32 s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[10:11], 0 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s18 +; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s18, 0x400 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -7112,41 +6664,45 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, 
v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 -; GFX7-NEXT: s_addk_i32 s8, 0x400 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s18, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 -; GFX7-NEXT: s_mov_b64 s[10:11], 0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX7-NEXT: 
s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -7160,35 +6716,39 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX7-NEXT: s_cbranch_execnz .LBB20_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 -; GFX6-NEXT: s_addk_i32 s8, 0x400 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, 
s18, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0 -; GFX6-NEXT: s_mov_b64 s[10:11], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -7203,26 +6763,26 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX6-NEXT: s_cbranch_execnz .LBB20_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst ret void } -define <2 x bfloat> 
@buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -7231,7 +6791,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -7245,15 +6805,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX12-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB17_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB17_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6 @@ -7277,7 +6837,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_perm_b32 
v5, v5, v4, 0x7060302 ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX12-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -7292,8 +6852,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB17_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -7302,18 +6862,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB21_3 +; GFX12-NEXT: s_cbranch_execnz .LBB17_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ 
-7326,7 +6886,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_1 +; GFX940-NEXT: s_cbranch_execnz .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 @@ -7334,9 +6894,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_movk_i32 s10, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX940-NEXT: s_mov_b32 s11, 0x7060302 -; GFX940-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB17_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB17_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX940-NEXT: v_max_f32_e32 v4, v4, v9 @@ -7357,7 +6917,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc ; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX940-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -7371,8 +6931,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB17_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: 
s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -7380,19 +6940,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB21_3 +; GFX940-NEXT: s_cbranch_execnz .LBB17_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -7406,16 +6966,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: s_cbranch_execnz .LBB17_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB17_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB17_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; 
GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 @@ -7439,7 +6999,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX11-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -7454,8 +7014,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB17_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -7465,20 +7025,20 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB21_3 +; GFX11-NEXT: s_cbranch_execnz .LBB17_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: .LBB21_1: ; =>This 
Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -7491,14 +7051,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB21_1 +; GFX10-NEXT: s_cbranch_execnz .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB17_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB17_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 @@ -7519,7 +7079,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX10-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -7533,8 +7093,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB21_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB17_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -7544,18 +7104,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB21_3 +; GFX10-NEXT: s_cbranch_execnz .LBB17_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -7568,7 +7128,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 @@ -7576,9 +7136,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 -; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB17_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB17_4 Depth 2 ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX90A-NEXT: v_max_f32_e32 v4, v4, v9 @@ -7597,7 +7157,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX90A-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -7610,8 +7170,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB17_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -7619,18 +7179,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB17_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; 
GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -7643,7 +7203,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 +; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 @@ -7651,9 +7211,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_movk_i32 s14, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX908-NEXT: s_mov_b32 s15, 0x7060302 -; GFX908-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB17_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB17_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX908-NEXT: v_max_f32_e32 v4, v4, v8 @@ -7673,7 +7233,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX908-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -7686,8 +7246,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz 
.LBB21_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB17_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -7695,18 +7255,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB21_3 +; GFX908-NEXT: s_cbranch_execnz .LBB17_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -7719,15 +7279,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB17_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB21_4 
Depth 2 +; GFX8-NEXT: ; Child Loop BB17_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX8-NEXT: v_max_f32_e32 v4, v4, v8 @@ -7750,7 +7310,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX8-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -7763,8 +7323,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB17_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -7772,18 +7332,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB21_3 +; GFX8-NEXT: s_cbranch_execnz .LBB17_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 
+; GFX7-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -7795,7 +7355,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: ; implicit-def: $vgpr4 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB21_1 +; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 @@ -7806,9 +7366,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB17_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB17_4 Depth 2 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v7 @@ -7822,7 +7382,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX7-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -7835,8 +7395,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB21_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB17_4 
+; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 @@ -7845,19 +7405,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB21_3 +; GFX7-NEXT: s_cbranch_execnz .LBB17_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v0, v7 ; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -7869,7 +7429,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: ; implicit-def: $vgpr4 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB21_1 +; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v6 @@ -7880,9 +7440,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX6-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB17_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This 
Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB17_4 Depth 2 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v7 @@ -7896,7 +7456,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX6-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -7909,8 +7469,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB21_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB17_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 @@ -7920,14 +7480,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB21_3 +; GFX6-NEXT: s_cbranch_execnz .LBB17_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, v7 ; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") 
seq_cst ret <2 x bfloat> %result } @@ -7935,21 +7495,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; misc ; -------------------------------------------------------------------- -define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: +define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 -; GFX12-NEXT: s_addk_co_i32 s4, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_num_f32 v2, v1, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 @@ -7966,22 +7526,22 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: v_mov_b32_e32 v0, s6 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s4, 0x400 +; GFX940-NEXT: s_addk_i32 s6, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 @@ -7995,21 +7555,21 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB22_1 +; GFX940-NEXT: s_cbranch_execnz .LBB18_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_add_i32 s4, s6, 0x400 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB18_1: ; 
%atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 @@ -8027,22 +7587,26 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: v_mov_b32_e32 v0, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_add_i32 s4, s18, 0x400 +; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 @@ -8051,29 +7615,33 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen 
glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_cbranch_execnz .LBB22_1 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v0, s18 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s10 -; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 @@ -8081,29 +7649,33 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX90A-NEXT: 
buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s10, s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s10 -; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 @@ -8111,28 +7683,32 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], 
v3, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 -; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v0 @@ -8140,28 +7716,32 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX8-NEXT: 
buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s10, s8, 0x400 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, s10 -; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v3, s6 +; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v0 @@ -8169,28 +7749,32 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX7-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX7-NEXT: 
s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB22_1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s10, s8, 0x400 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, s10 -; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v3, s6 +; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v0 @@ -8199,22 +7783,22 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX6-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v0, v4 ; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB22_1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(7) %gep, float %val seq_cst ret float %result } attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" } -!0 = !{} + diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index 2791162396a91..e0e6ccd72caea 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -13,28 +13,28 @@ ; float ; -------------------------------------------------------------------- -define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: +define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: v_mov_b32_e32 v1, s6 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen 
offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: v_mov_b32_e32 v0, s6 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s4, 0x400 +; GFX940-NEXT: s_addk_i32 s6, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 @@ -57,10 +57,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -68,27 +68,35 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 ; 
GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v0, s18 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -96,27 +104,31 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 ; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 
exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s10, s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s10 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -125,27 +137,31 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: 
s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -154,61 +170,69 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB0_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s8 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s18 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s18 +; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst ret float %result } -define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: +define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) 
#0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: v_mov_b32_e32 v1, s6 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: v_mov_b32_e32 v1, s6 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s4, 0x400 +; GFX940-NEXT: s_addk_i32 s6, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v0, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 @@ -231,10 +255,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -242,53 +266,65 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s8 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v1, s18 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, 
s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s8 -; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s10, s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, s18 +; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, s10 +; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -296,27 +332,31 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 
vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s18 +; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -324,43 +364,51 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, 
s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s8 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s18 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s18 +; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst ret void } -define float 
@buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: +define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr addrspace(7) %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -393,7 +441,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 @@ -454,7 +502,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s1, exec_lo @@ -484,7 +532,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s5, exec_lo @@ -513,7 +561,7 @@ define float 
@buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 @@ -572,7 +620,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 @@ -632,7 +680,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 @@ -692,7 +740,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b64 s[6:7], exec @@ -718,7 +766,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b64 s[6:7], exec @@ -745,357 +793,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret float %result -} - -define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s4, 0x400 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; 
GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB3_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s8 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, 
s10 -; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s10, s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s10 -; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB3_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; 
GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 -; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB3_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s8 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(7) %ptr, 
i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 - ret float %result -} - -define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s4, 0x400 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 
exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB4_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s8 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s10 -; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: 
buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s10, s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s10 -; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB4_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 
offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 -; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB4_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s8 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + %result = atomicrmw fmin ptr addrspace(7) %gep, float 
%val syncscope("agent") seq_cst ret float %result } @@ -1103,8 +801,8 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; double ; -------------------------------------------------------------------- -define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: +define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1112,14 +810,14 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: s_addk_co_i32 s4, 0x800 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 @@ -1137,33 +835,33 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-NEXT: s_cbranch_execnz .LBB3_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x800 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: s_add_i32 s4, s6, 0x800 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 @@ -1182,43 +880,55 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB5_1 +; GFX11-NEXT: s_cbranch_execnz .LBB3_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: v_mov_b32_e32 v2, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: v_mov_b32_e32 v0, s18 ; GFX908-NEXT: v_mov_b32_e32 
v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: s_add_i32 s10, s8, 0x800 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s10 -; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v10, v1 @@ -1229,29 +939,33 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_mov_b32_e32 v1, v8 ; GFX908-NEXT: v_mov_b32_e32 v2, v9 ; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: 
v_mov_b32_e32 v0, s18 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 ; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: s_add_i32 s10, s8, 0x800 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s10 -; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v10, v1 @@ -1262,56 +976,64 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_mov_b32_e32 v1, v8 ; GFX8-NEXT: v_mov_b32_e32 v2, v9 ; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: 
s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst ret double %result } -define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] -; GFX12-NEXT: 
s_addk_co_i32 s4, 0x800 +; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] @@ -1328,32 +1050,32 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX11-NEXT: s_addk_i32 s4, 0x800 +; GFX11-NEXT: s_add_i32 s4, s6, 0x800 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: buffer_load_b64 v[2:3], 
v2, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -1371,41 +1093,53 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: s_cbranch_execnz .LBB4_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: v_mov_b32_e32 v2, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 +; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; 
GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s8 -; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[4:7], 0 offen offset:2048 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v2, s18 +; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048 ; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX908-NEXT: s_add_i32 s10, s8, 0x800 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s10 -; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -1414,29 +1148,33 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_mov_b32_e32 v9, v2 ; GFX908-NEXT: v_mov_b32_e32 v8, v1 ; GFX908-NEXT: v_mov_b32_e32 v7, v0 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] ; GFX908-NEXT: v_mov_b32_e32 v2, v7 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v8 -; GFX908-NEXT: s_andn2_b64 exec, 
exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s8 -; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[4:7], 0 offen offset:2048 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048 ; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX8-NEXT: s_add_i32 s10, s8, 0x800 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s10 -; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -1445,44 +1183,52 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_mov_b32_e32 v9, v2 ; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v0 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, v7 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v8 -; GFX8-NEXT: 
s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst ret void } 
-define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall(ptr addrspace(7) %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1493,7 +1239,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_readfirstlane_b32 s4, v9 ; GFX12-NEXT: v_readfirstlane_b32 s5, v10 @@ -1508,14 +1254,14 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB7_1 +; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB5_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB5_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] ; GFX12-NEXT: s_mov_b32 s2, exec_lo @@ -1524,7 +1270,7 @@ define double 
@buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[0:1], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 -; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX12-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v9 ; GFX12-NEXT: v_readfirstlane_b32 s5, v10 @@ -1539,8 +1285,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB7_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB5_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] @@ -1549,19 +1295,19 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB7_3 +; GFX12-NEXT: s_cbranch_execnz .LBB5_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v6 ; GFX940-NEXT: v_mov_b32_e32 v6, v5 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: 
.LBB5_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -1576,7 +1322,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 +; GFX940-NEXT: s_cbranch_execnz .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1585,7 +1331,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 @@ -1593,7 +1339,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_readfirstlane_b32 s4, v9 ; GFX11-NEXT: v_readfirstlane_b32 s5, v10 @@ -1608,14 +1354,14 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: s_cbranch_execnz .LBB5_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: 
v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB5_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB5_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -1624,7 +1370,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 -; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX11-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v9 ; GFX11-NEXT: v_readfirstlane_b32 s5, v10 @@ -1639,8 +1385,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB7_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB5_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] @@ -1650,17 +1396,17 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB7_3 +; GFX11-NEXT: s_cbranch_execnz .LBB5_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s5, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -1675,7 +1421,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB7_1 +; GFX10-NEXT: s_cbranch_execnz .LBB5_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1685,609 +1431,223 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v6 ; GFX90A-NEXT: v_mov_b32_e32 v6, v5 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 -; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 -; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], 
s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 -; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v8, v3 -; GFX908-NEXT: v_mov_b32_e32 v7, v2 -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4 -; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_readfirstlane_b32 s8, v9 -; GFX908-NEXT: v_readfirstlane_b32 s9, v10 -; GFX908-NEXT: v_readfirstlane_b32 s10, v7 -; GFX908-NEXT: v_readfirstlane_b32 s11, v8 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: ; implicit-def: $vgpr4 -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_1 -; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] -; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB7_3: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB7_4 Depth 2 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], 
v[13:14], v[13:14] -; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v11 -; GFX908-NEXT: v_mov_b32_e32 v1, v12 -; GFX908-NEXT: v_mov_b32_e32 v2, v13 -; GFX908-NEXT: v_mov_b32_e32 v3, v14 -; GFX908-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 -; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: v_readfirstlane_b32 s8, v9 -; GFX908-NEXT: v_readfirstlane_b32 s9, v10 -; GFX908-NEXT: v_readfirstlane_b32 s10, v7 -; GFX908-NEXT: v_readfirstlane_b32 s11, v8 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] -; GFX908-NEXT: v_mov_b32_e32 v14, v1 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v13, v0 -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB7_3 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, v3 -; GFX8-NEXT: v_mov_b32_e32 v7, v2 -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_readfirstlane_b32 
s8, v9 -; GFX8-NEXT: v_readfirstlane_b32 s9, v10 -; GFX8-NEXT: v_readfirstlane_b32 s10, v7 -; GFX8-NEXT: v_readfirstlane_b32 s11, v8 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_1 -; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB7_3: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB7_4 Depth 2 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] -; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v11 -; GFX8-NEXT: v_mov_b32_e32 v1, v12 -; GFX8-NEXT: v_mov_b32_e32 v2, v13 -; GFX8-NEXT: v_mov_b32_e32 v3, v14 -; GFX8-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 -; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX8-NEXT: v_readfirstlane_b32 s8, v9 -; GFX8-NEXT: v_readfirstlane_b32 s9, v10 -; GFX8-NEXT: v_readfirstlane_b32 s10, v7 -; GFX8-NEXT: v_readfirstlane_b32 s11, v8 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] -; GFX8-NEXT: v_mov_b32_e32 
v14, v1 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v13, v0 -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB7_3 -; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_readfirstlane_b32 s8, v0 -; GFX7-NEXT: v_readfirstlane_b32 s9, v1 -; GFX7-NEXT: v_readfirstlane_b32 s10, v2 -; GFX7-NEXT: v_readfirstlane_b32 s11, v3 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc -; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX7-NEXT: ; implicit-def: $vgpr4 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB7_1 -; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, v5 -; GFX7-NEXT: v_mov_b32_e32 v1, v6 -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s9, v1 -; GFX6-NEXT: v_readfirstlane_b32 s10, v2 -; GFX6-NEXT: v_readfirstlane_b32 s11, v3 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX6-NEXT: 
v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc -; GFX6-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX6-NEXT: ; implicit-def: $vgpr4 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB7_1 -; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v5 -; GFX6-NEXT: v_mov_b32_e32 v1, v6 -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret double %result -} - -define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: s_addk_co_i32 s4, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB8_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: 
s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s8 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: s_add_i32 s10, s8, 0x800 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s10 -; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: s_add_i32 s10, s8, 0x800 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s10 -; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: 
v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 - ret double %result -} - -define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; 
GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: s_addk_co_i32 s4, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB9_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, 
s[0:3], 0 offen offset:2048 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX10: ; %bb.0: 
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s8 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 +; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 +; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4 +; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen 
offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: s_add_i32 s10, s8, 0x800 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s10 -; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v8, v3 +; GFX908-NEXT: v_mov_b32_e32 v7, v2 ; GFX908-NEXT: v_mov_b32_e32 v10, v1 ; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4 +; GFX908-NEXT: s_mov_b64 s[6:7], exec +; GFX908-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_readfirstlane_b32 s8, v9 +; GFX908-NEXT: v_readfirstlane_b32 s9, v10 +; GFX908-NEXT: v_readfirstlane_b32 s10, v7 +; GFX908-NEXT: v_readfirstlane_b32 s11, v8 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: ; implicit-def: $vgpr4 +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: ; %bb.2: +; GFX908-NEXT: s_mov_b64 exec, s[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB5_3: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Loop Header: Depth=1 +; GFX908-NEXT: ; Child Loop BB5_4 Depth 2 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] +; 
GFX908-NEXT: v_mov_b32_e32 v0, v11 +; GFX908-NEXT: v_mov_b32_e32 v1, v12 +; GFX908-NEXT: v_mov_b32_e32 v2, v13 +; GFX908-NEXT: v_mov_b32_e32 v3, v14 +; GFX908-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 +; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX908-NEXT: v_readfirstlane_b32 s8, v9 +; GFX908-NEXT: v_readfirstlane_b32 s9, v10 +; GFX908-NEXT: v_readfirstlane_b32 s10, v7 +; GFX908-NEXT: v_readfirstlane_b32 s11, v8 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB5_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 +; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] +; GFX908-NEXT: v_mov_b32_e32 v14, v1 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v13, v0 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB9_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB5_3 +; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; 
GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: s_add_i32 s10, s8, 0x800 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s10 -; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v8, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, v2 ; GFX8-NEXT: v_mov_b32_e32 v10, v1 ; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_readfirstlane_b32 s8, v9 +; GFX8-NEXT: v_readfirstlane_b32 s9, v10 +; GFX8-NEXT: v_readfirstlane_b32 s10, v7 +; GFX8-NEXT: v_readfirstlane_b32 s11, v8 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: ; implicit-def: $vgpr4 +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: ; %bb.2: +; GFX8-NEXT: s_mov_b64 exec, s[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB5_3: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Loop Header: Depth=1 +; GFX8-NEXT: ; Child Loop BB5_4 Depth 2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_min_f64 v[11:12], v[0:1], 
v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v11 +; GFX8-NEXT: v_mov_b32_e32 v1, v12 +; GFX8-NEXT: v_mov_b32_e32 v2, v13 +; GFX8-NEXT: v_mov_b32_e32 v3, v14 +; GFX8-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 +; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX8-NEXT: v_readfirstlane_b32 s8, v9 +; GFX8-NEXT: v_readfirstlane_b32 s9, v10 +; GFX8-NEXT: v_readfirstlane_b32 s10, v7 +; GFX8-NEXT: v_readfirstlane_b32 s11, v8 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB5_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 +; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] +; GFX8-NEXT: v_mov_b32_e32 v14, v1 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v13, v0 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB9_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB5_3 +; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX7-NEXT: s_mov_b64 
s[6:7], exec +; GFX7-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_readfirstlane_b32 s8, v0 +; GFX7-NEXT: v_readfirstlane_b32 s9, v1 +; GFX7-NEXT: v_readfirstlane_b32 s10, v2 +; GFX7-NEXT: v_readfirstlane_b32 s11, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX7-NEXT: ; implicit-def: $vgpr4 +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7-NEXT: ; %bb.2: +; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, v5 +; GFX7-NEXT: v_mov_b32_e32 v1, v6 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s9, v1 +; GFX6-NEXT: v_readfirstlane_b32 s10, v2 +; GFX6-NEXT: v_readfirstlane_b32 s11, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX6-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX6-NEXT: ; 
implicit-def: $vgpr4 +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB5_1 +; GFX6-NEXT: ; %bb.2: +; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, v5 +; GFX6-NEXT: v_mov_b32_e32 v1, v6 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst ret double %result } @@ -2295,26 +1655,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; half ; -------------------------------------------------------------------- -define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: +define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s4, 0x200 +; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0 -; GFX12-NEXT: s_and_b32 s5, s4, -4 -; GFX12-NEXT: s_and_b32 s4, s4, 3 -; GFX12-NEXT: v_mov_b32_e32 v4, s5 +; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-NEXT: s_and_b32 s4, s6, 3 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -2336,26 +1697,26 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s4, 0x200 -; GFX940-NEXT: s_and_b32 s5, s4, -4 -; GFX940-NEXT: v_mov_b32_e32 v4, s5 +; GFX940-NEXT: s_addk_i32 s6, 0x200 +; GFX940-NEXT: s_and_b32 s4, s6, -4 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s4, 3 +; GFX940-NEXT: s_and_b32 s4, s6, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -2372,28 +1733,29 @@ define half 
@buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-NEXT: s_cbranch_execnz .LBB6_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s4, 0x200 +; GFX11-NEXT: s_addk_i32 s6, 0x200 ; GFX11-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX11-NEXT: s_and_b32 s5, s4, -4 -; GFX11-NEXT: s_and_b32 s4, s4, 3 -; GFX11-NEXT: v_mov_b32_e32 v4, s5 +; GFX11-NEXT: s_and_b32 s4, s6, -4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_and_b32 s4, s6, 3 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: s_not_b32 s6, s5 ; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -2416,264 +1778,289 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-NEXT: 
s_cbranch_execnz .LBB6_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s8, 0x200 +; GFX10-NEXT: s_addk_i32 s18, 0x200 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_and_b32 s4, s18, -4 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_and_b32 s4, s18, 3 ; GFX10-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX10-NEXT: s_and_b32 s9, s8, -4 -; GFX10-NEXT: s_and_b32 s8, s8, 3 -; GFX10-NEXT: v_mov_b32_e32 v4, s9 -; GFX10-NEXT: s_lshl_b32 s8, s8, 3 -; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 -; GFX10-NEXT: s_not_b32 s10, s9 -; GFX10-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX10-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 
v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_cbranch_execnz .LBB10_1 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB6_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s8, 0x200 -; GFX90A-NEXT: s_and_b32 s9, s8, -4 -; GFX90A-NEXT: v_mov_b32_e32 v4, s9 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; GFX90A-NEXT: s_and_b32 s8, s8, 3 -; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 -; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX90A-NEXT: s_not_b32 s11, s8 -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: s_addk_i32 s18, 0x200 +; GFX90A-NEXT: s_and_b32 s4, s18, -4 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s18, 3 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0 -; 
GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX90A-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s10, v0 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s8, 0x200 -; GFX908-NEXT: s_and_b32 s9, s8, -4 -; GFX908-NEXT: v_mov_b32_e32 v4, s9 -; GFX908-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; GFX908-NEXT: s_and_b32 s8, s8, 3 -; GFX908-NEXT: s_lshl_b32 s10, s8, 3 -; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX908-NEXT: s_not_b32 s11, s8 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; 
GFX908-NEXT: s_addk_i32 s18, 0x200 +; GFX908-NEXT: s_and_b32 s4, s18, -4 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX908-NEXT: s_and_b32 s4, s18, 3 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX908-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s10, v0 -; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s8, 0x200 -; GFX8-NEXT: s_and_b32 s9, s8, -4 -; GFX8-NEXT: v_mov_b32_e32 v4, s9 -; GFX8-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; GFX8-NEXT: s_and_b32 s8, s8, 3 -; GFX8-NEXT: s_lshl_b32 s10, s8, 3 -; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX8-NEXT: s_not_b32 s11, s8 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: s_addk_i32 s18, 0x200 +; GFX8-NEXT: s_and_b32 s4, s18, -4 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX8-NEXT: s_and_b32 s4, s18, 3 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX8-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX8-NEXT: v_and_b32_e32 v2, s11, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: 
v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s8, 0x200 -; GFX7-NEXT: s_and_b32 s9, s8, -4 -; GFX7-NEXT: v_mov_b32_e32 v4, s9 -; GFX7-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX7-NEXT: s_addk_i32 s18, 0x200 +; GFX7-NEXT: s_and_b32 s4, s18, -4 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_and_b32 s8, s8, 3 -; GFX7-NEXT: s_lshl_b32 s10, s8, 3 -; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX7-NEXT: s_and_b32 s4, s18, 3 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX7-NEXT: s_not_b32 s11, s8 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s11, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; 
GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB6_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s8, 0x200 -; GFX6-NEXT: s_and_b32 s9, s8, -4 -; GFX6-NEXT: v_mov_b32_e32 v4, s9 -; GFX6-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX6-NEXT: s_addk_i32 s18, 0x200 +; GFX6-NEXT: s_and_b32 s4, s18, -4 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s8, s8, 3 -; GFX6-NEXT: s_lshl_b32 s10, s8, 3 -; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX6-NEXT: s_and_b32 s4, s18, 3 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-NEXT: 
s_not_b32 s11, s8 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s11, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 ; GFX6-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB10_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB6_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst ret half %result } -define void 
@buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: +define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s4, 0x200 +; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 -; GFX12-NEXT: s_and_b32 s5, s4, -4 -; GFX12-NEXT: s_and_b32 s4, s4, 3 -; GFX12-NEXT: v_mov_b32_e32 v2, s5 +; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_and_b32 s4, s6, 3 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -2695,25 +2082,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, 
s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s4, 0x200 -; GFX940-NEXT: s_and_b32 s5, s4, -4 -; GFX940-NEXT: v_mov_b32_e32 v2, s5 +; GFX940-NEXT: s_addk_i32 s6, 0x200 +; GFX940-NEXT: s_and_b32 s4, s6, -4 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s4, 3 +; GFX940-NEXT: s_and_b32 s4, s6, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -2730,27 +2117,28 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 +; GFX940-NEXT: s_cbranch_execnz .LBB7_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s4, 0x200 +; GFX11-NEXT: s_addk_i32 s6, 0x200 ; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX11-NEXT: s_and_b32 s5, s4, -4 -; GFX11-NEXT: s_and_b32 s4, s4, 3 -; GFX11-NEXT: v_mov_b32_e32 v2, s5 +; GFX11-NEXT: s_and_b32 s4, s6, -4 +; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: s_and_b32 s4, s6, 3 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: s_not_b32 s6, s5 ; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -2773,237 +2161,261 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-NEXT: s_cbranch_execnz .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s8, 0x200 +; GFX10-NEXT: s_addk_i32 s18, 0x200 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_and_b32 s4, s18, -4 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_and_b32 s4, s18, 3 ; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX10-NEXT: s_and_b32 s9, s8, -4 -; GFX10-NEXT: s_and_b32 s8, s8, 3 -; GFX10-NEXT: v_mov_b32_e32 v2, s9 -; GFX10-NEXT: s_lshl_b32 s8, s8, 3 -; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 -; GFX10-NEXT: s_not_b32 s10, s9 -; 
GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX10-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_cbranch_execnz .LBB11_1 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB7_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s8, 0x200 -; GFX90A-NEXT: s_and_b32 s9, s8, -4 -; GFX90A-NEXT: v_mov_b32_e32 v2, s9 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX90A-NEXT: s_and_b32 s8, s8, 3 -; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 -; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX90A-NEXT: s_not_b32 s11, s8 -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: s_addk_i32 s18, 0x200 +; GFX90A-NEXT: s_and_b32 s4, s18, -4 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s18, 3 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX90A-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s10, v0 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NEXT: s_andn2_b64 exec, 
exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s8, 0x200 -; GFX908-NEXT: s_and_b32 s9, s8, -4 -; GFX908-NEXT: v_mov_b32_e32 v2, s9 -; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX908-NEXT: s_and_b32 s8, s8, 3 -; GFX908-NEXT: s_lshl_b32 s10, s8, 3 -; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX908-NEXT: s_not_b32 s11, s8 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: s_addk_i32 s18, 0x200 +; GFX908-NEXT: s_and_b32 s4, s18, -4 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v2, s4 +; GFX908-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX908-NEXT: s_and_b32 s4, s18, 3 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX908-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s10, v0 -; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, 
s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s8, 0x200 -; GFX8-NEXT: s_and_b32 s9, s8, -4 -; GFX8-NEXT: v_mov_b32_e32 v2, s9 -; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX8-NEXT: s_and_b32 s8, s8, 3 -; GFX8-NEXT: s_lshl_b32 s10, s8, 3 -; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX8-NEXT: s_not_b32 s11, s8 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: s_addk_i32 s18, 0x200 +; GFX8-NEXT: s_and_b32 s4, s18, -4 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX8-NEXT: s_and_b32 s4, s18, 3 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 
; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX8-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX8-NEXT: v_and_b32_e32 v4, s11, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB11_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s8, 0x200 -; GFX7-NEXT: s_and_b32 s9, s8, -4 -; GFX7-NEXT: v_mov_b32_e32 v2, s9 -; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX7-NEXT: s_addk_i32 s18, 0x200 +; GFX7-NEXT: s_and_b32 s4, s18, -4 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_and_b32 s8, s8, 3 -; GFX7-NEXT: s_lshl_b32 s10, s8, 3 -; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX7-NEXT: s_and_b32 s4, s18, 3 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: 
v_cvt_f32_f16_e32 v3, v0 -; GFX7-NEXT: s_not_b32 s11, s8 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s11, v1 +; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s8, 0x200 -; GFX6-NEXT: s_and_b32 s9, s8, -4 -; GFX6-NEXT: v_mov_b32_e32 v2, s9 -; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX6-NEXT: s_addk_i32 s18, 0x200 +; GFX6-NEXT: s_and_b32 s4, s18, -4 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: 
s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s8, s8, 3 -; GFX6-NEXT: s_lshl_b32 s10, s8, 3 -; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX6-NEXT: s_and_b32 s4, s18, 3 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX6-NEXT: s_not_b32 s11, s8 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s11, v1 +; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 ; GFX6-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB11_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB7_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = 
getelementptr half, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst ret void } -define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr addrspace(7) %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -3019,7 +2431,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX12-NEXT: v_not_b32_e32 v9, v6 -; GFX12-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -3032,14 +2444,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB8_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB8_4 Depth 2 ; 
GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo @@ -3054,7 +2466,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 +; GFX12-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -3069,8 +2481,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB12_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB8_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -3079,13 +2491,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB12_3 +; GFX12-NEXT: s_cbranch_execnz .LBB8_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -3096,7 +2508,7 @@ define half 
@buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 ; GFX940-NEXT: v_not_b32_e32 v10, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -3108,14 +2520,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-NEXT: s_cbranch_execnz .LBB8_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_max_f16_e32 v11, v5, v5 -; GFX940-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB8_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB8_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v8, v7 ; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 @@ -3125,7 +2537,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 +; GFX940-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -3139,8 +2551,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz 
.LBB12_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB8_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -3148,13 +2560,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB12_3 +; GFX940-NEXT: s_cbranch_execnz .LBB8_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -3167,7 +2579,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX11-NEXT: v_not_b32_e32 v9, v6 -; GFX11-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -3180,14 +2592,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-NEXT: s_cbranch_execnz .LBB8_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_max_f16_e32 v10, v5, v5 ; 
GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB8_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB8_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -3202,7 +2614,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 +; GFX11-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -3217,8 +2629,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB12_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB8_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -3228,13 +2640,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB12_3 +; GFX11-NEXT: s_cbranch_execnz .LBB8_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -3245,7 +2657,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 ; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX10-NEXT: v_not_b32_e32 v9, v6 -; GFX10-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -3257,13 +2669,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-NEXT: s_cbranch_execnz .LBB8_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_max_f16_e32 v10, v5, v5 -; GFX10-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB8_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB8_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo @@ -3274,7 +2686,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 +; GFX10-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -3288,8 +2700,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: buffer_atomic_cmpswap 
v[4:5], v8, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB12_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB8_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -3299,13 +2711,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB12_3 +; GFX10-NEXT: s_cbranch_execnz .LBB8_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -3316,7 +2728,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 ; GFX90A-NEXT: v_not_b32_e32 v10, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -3328,14 +2740,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; 
GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_max_f16_e32 v11, v5, v5 -; GFX90A-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB8_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB8_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 @@ -3344,7 +2756,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 +; GFX90A-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -3357,8 +2769,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB8_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -3366,13 +2778,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB8_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX90A-NEXT: s_setpc_b64 
s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -3383,7 +2795,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX908-NEXT: v_not_b32_e32 v9, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -3395,14 +2807,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: s_cbranch_execnz .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_max_f16_e32 v10, v5, v5 -; GFX908-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB8_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB8_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 @@ -3412,7 +2824,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 +; GFX908-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; 
GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -3425,8 +2837,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB8_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -3434,13 +2846,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB12_3 +; GFX908-NEXT: s_cbranch_execnz .LBB8_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 @@ -3451,7 +2863,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX8-NEXT: v_not_b32_e32 v9, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -3463,14 +2875,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dword v6, v8, 
s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: s_cbranch_execnz .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_max_f16_e32 v10, v5, v5 -; GFX8-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB8_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB8_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 @@ -3481,7 +2893,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 +; GFX8-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -3494,8 +2906,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB8_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -3503,13 +2915,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB12_3 +; GFX8-NEXT: s_cbranch_execnz .LBB8_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; 
GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -3519,7 +2931,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -3530,15 +2942,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7-NEXT: s_cbranch_execnz .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4 -; GFX7-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB8_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB8_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 @@ -3550,7 +2962,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: v_mov_b32_e32 v5, v6 -; GFX7-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 +; GFX7-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: 
v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -3563,8 +2975,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB12_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB8_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -3572,14 +2984,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB12_3 +; GFX7-NEXT: s_cbranch_execnz .LBB8_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -3589,7 +3001,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -3600,15 +3012,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], 
s[4:5] ; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB12_1 +; GFX6-NEXT: s_cbranch_execnz .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4 -; GFX6-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB8_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB8_4 Depth 2 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 @@ -3620,7 +3032,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: v_mov_b32_e32 v5, v6 -; GFX6-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 +; GFX6-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -3633,8 +3045,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB12_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB8_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -3642,7 +3054,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB12_3 +; GFX6-NEXT: s_cbranch_execnz .LBB8_3 ; GFX6-NEXT: ; %bb.6: ; 
%atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 @@ -3650,7 +3062,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst ret half %result } @@ -3658,26 +3070,27 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; bfloat ; -------------------------------------------------------------------- -define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: +define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s4, 0x200 +; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX12-NEXT: s_and_b32 s5, s4, -4 -; GFX12-NEXT: s_and_b32 s4, s4, 3 -; GFX12-NEXT: v_mov_b32_e32 v4, s5 +; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-NEXT: s_and_b32 s4, s6, 3 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; 
GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3706,27 +3119,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s4, 0x200 -; GFX940-NEXT: s_and_b32 s5, s4, -4 -; GFX940-NEXT: v_mov_b32_e32 v4, s5 +; GFX940-NEXT: s_addk_i32 s6, 0x200 +; GFX940-NEXT: s_and_b32 s4, s6, -4 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s4, 3 +; GFX940-NEXT: s_and_b32 s4, s6, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3748,28 +3161,29 @@ define bfloat 
@buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NEXT: s_cbranch_execnz .LBB9_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s4, 0x200 +; GFX11-NEXT: s_addk_i32 s6, 0x200 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: s_and_b32 s5, s4, -4 -; GFX11-NEXT: s_and_b32 s4, s4, 3 -; GFX11-NEXT: v_mov_b32_e32 v4, s5 +; GFX11-NEXT: s_and_b32 s4, s6, -4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_and_b32 s4, s6, 3 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: s_not_b32 s6, s5 ; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3799,29 +3213,33 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-NEXT: 
s_cbranch_execnz .LBB9_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s8, 0x200 +; GFX10-NEXT: s_addk_i32 s18, 0x200 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_and_b32 s4, s18, -4 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_and_b32 s4, s18, 3 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: s_and_b32 s9, s8, -4 -; GFX10-NEXT: s_and_b32 s8, s8, 3 -; GFX10-NEXT: v_mov_b32_e32 v4, s9 -; GFX10-NEXT: s_lshl_b32 s8, s8, 3 -; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 -; GFX10-NEXT: s_not_b32 s10, s9 -; GFX10-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX10-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 @@ -3829,121 +3247,133 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; 
GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB9_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s8, 0x200 -; GFX90A-NEXT: s_and_b32 s9, s8, -4 -; GFX90A-NEXT: v_mov_b32_e32 v4, s9 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; GFX90A-NEXT: s_and_b32 s8, s8, 3 -; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 -; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX90A-NEXT: s_not_b32 s11, s8 -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: s_addk_i32 s18, 0x200 +; GFX90A-NEXT: s_and_b32 s4, s18, -4 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; 
GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s18, 3 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff -; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: 
s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s8, 0x200 -; GFX908-NEXT: s_and_b32 s9, s8, -4 -; GFX908-NEXT: v_mov_b32_e32 v4, s9 -; GFX908-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; GFX908-NEXT: s_and_b32 s8, s8, 3 -; GFX908-NEXT: s_lshl_b32 s10, s8, 3 -; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX908-NEXT: s_not_b32 s11, s8 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: s_addk_i32 s18, 0x200 +; GFX908-NEXT: s_and_b32 s4, s18, -4 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX908-NEXT: s_and_b32 s4, s18, 3 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff -; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX908-NEXT: 
v_add3_u32 v2, v2, v0, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s8, 0x200 -; GFX8-NEXT: s_and_b32 s9, s8, -4 -; GFX8-NEXT: v_mov_b32_e32 v4, s9 -; GFX8-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; GFX8-NEXT: s_and_b32 s8, s8, 3 -; GFX8-NEXT: s_lshl_b32 s10, s8, 3 -; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX8-NEXT: s_not_b32 s11, s8 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: s_addk_i32 s18, 0x200 +; GFX8-NEXT: s_and_b32 s4, s18, -4 +; GFX8-NEXT: s_mov_b32 s11, s17 +; 
GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX8-NEXT: s_and_b32 s4, s18, 3 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f32_e32 v3, v3, v5 @@ -3953,132 +3383,141 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v2, s11, v1 +; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX8-NEXT: s_or_b64 exec, 
exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s8, 0x200 -; GFX7-NEXT: s_and_b32 s9, s8, -4 -; GFX7-NEXT: v_mov_b32_e32 v4, s9 -; GFX7-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; GFX7-NEXT: s_and_b32 s8, s8, 3 -; GFX7-NEXT: s_lshl_b32 s10, s8, 3 -; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX7-NEXT: s_addk_i32 s18, 0x200 +; GFX7-NEXT: s_and_b32 s4, s18, -4 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX7-NEXT: s_and_b32 s4, s18, 3 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s11, s8 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s11, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc +; GFX7-NEXT: 
buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB13_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB9_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s8, 0x200 -; GFX6-NEXT: s_and_b32 s9, s8, -4 -; GFX6-NEXT: v_mov_b32_e32 v4, s9 -; GFX6-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen -; GFX6-NEXT: s_and_b32 s8, s8, 3 -; GFX6-NEXT: s_lshl_b32 s10, s8, 3 -; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX6-NEXT: s_addk_i32 s18, 0x200 +; GFX6-NEXT: s_and_b32 s4, s18, -4 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX6-NEXT: s_and_b32 s4, s18, 3 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s11, s8 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s11, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB13_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB9_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst ret bfloat %result } -define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: +define void 
@buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s4, 0x200 +; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_and_b32 s5, s4, -4 -; GFX12-NEXT: s_and_b32 s4, s4, 3 -; GFX12-NEXT: v_mov_b32_e32 v2, s5 +; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_and_b32 s4, s6, 3 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -4107,26 +3546,26 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: ; GFX940: ; %bb.0: ; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s4, 0x200 -; GFX940-NEXT: s_and_b32 s5, s4, -4 -; GFX940-NEXT: v_mov_b32_e32 v2, s5 +; GFX940-NEXT: s_addk_i32 s6, 0x200 +; GFX940-NEXT: s_and_b32 s4, s6, -4 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s4, 3 +; GFX940-NEXT: s_and_b32 s4, s6, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -4148,27 +3587,28 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NEXT: s_cbranch_execnz .LBB10_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s4, 0x200 +; GFX11-NEXT: s_addk_i32 s6, 0x200 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: s_and_b32 s5, s4, -4 -; GFX11-NEXT: s_and_b32 s4, s4, 3 -; GFX11-NEXT: v_mov_b32_e32 v2, s5 +; GFX11-NEXT: s_and_b32 s4, s6, -4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: 
s_and_b32 s4, s6, 3 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: s_not_b32 s6, s5 ; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-NEXT: s_not_b32 s6, s5 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -4198,28 +3638,32 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s8, 0x200 +; GFX10-NEXT: s_addk_i32 s18, 0x200 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_and_b32 s4, s18, -4 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_and_b32 s4, s18, 3 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX10-NEXT: s_and_b32 s9, s8, -4 -; GFX10-NEXT: s_and_b32 s8, s8, 3 -; GFX10-NEXT: v_mov_b32_e32 v2, s9 -; GFX10-NEXT: s_lshl_b32 s8, s8, 3 -; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 -; GFX10-NEXT: s_not_b32 s10, s9 -; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: .LBB14_1: ; 
%atomicrmw.start +; GFX10-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 @@ -4227,118 +3671,130 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_or_b32 exec_lo, 
exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s8, 0x200 -; GFX90A-NEXT: s_and_b32 s9, s8, -4 -; GFX90A-NEXT: v_mov_b32_e32 v2, s9 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX90A-NEXT: s_and_b32 s8, s8, 3 -; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 -; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX90A-NEXT: s_not_b32 s11, s8 -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: s_addk_i32 s18, 0x200 +; GFX90A-NEXT: s_and_b32 s4, s18, -4 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s18, 3 +; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff -; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s8, 0x200 -; GFX908-NEXT: s_and_b32 s9, s8, -4 -; GFX908-NEXT: v_mov_b32_e32 v2, s9 -; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX908-NEXT: s_and_b32 s8, s8, 3 -; GFX908-NEXT: s_lshl_b32 s10, s8, 3 -; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX908-NEXT: s_not_b32 s11, s8 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: s_addk_i32 s18, 0x200 +; GFX908-NEXT: s_and_b32 s4, s18, -4 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v2, s4 +; GFX908-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX908-NEXT: s_and_b32 s4, s18, 3 +; GFX908-NEXT: s_lshl_b32 s6, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 
0xffff, s6 +; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff -; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX908-NEXT: v_add3_u32 v4, v4, v0, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s8, 0x200 -; GFX8-NEXT: s_and_b32 s9, s8, -4 -; GFX8-NEXT: v_mov_b32_e32 v2, s9 -; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX8-NEXT: s_and_b32 s8, s8, 3 -; GFX8-NEXT: s_lshl_b32 s10, s8, 3 -; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 -; GFX8-NEXT: s_not_b32 s11, s8 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: s_addk_i32 s18, 0x200 +; GFX8-NEXT: s_and_b32 s4, s18, -4 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX8-NEXT: s_and_b32 s4, s18, 3 +; GFX8-NEXT: s_lshl_b32 s6, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f32_e32 v5, v5, v3 @@ -4348,109 +3804,117 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s11, v1 +; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; 
GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s8, 0x200 -; GFX7-NEXT: s_and_b32 s9, s8, -4 -; GFX7-NEXT: v_mov_b32_e32 v2, s9 -; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX7-NEXT: s_and_b32 s8, s8, 3 -; GFX7-NEXT: s_lshl_b32 s10, s8, 3 -; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX7-NEXT: s_addk_i32 s18, 0x200 +; GFX7-NEXT: s_and_b32 s4, s18, -4 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX7-NEXT: s_and_b32 s4, s18, 3 +; GFX7-NEXT: s_lshl_b32 s6, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s11, s8 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: 
v_lshrrev_b32_e32 v0, s10, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s11, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s8, 0x200 -; GFX6-NEXT: s_and_b32 s9, s8, -4 -; GFX6-NEXT: v_mov_b32_e32 v2, s9 -; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX6-NEXT: s_and_b32 s8, s8, 3 -; GFX6-NEXT: s_lshl_b32 s10, s8, 3 -; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX6-NEXT: s_addk_i32 s18, 0x200 +; GFX6-NEXT: s_and_b32 s4, s18, -4 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; 
GFX6-NEXT: s_and_b32 s4, s18, 3 +; GFX6-NEXT: s_lshl_b32 s6, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s11, s8 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s11, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB14_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(7) %gep, 
bfloat %val syncscope("agent") seq_cst ret void } -define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr addrspace(7) %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -4466,7 +3930,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX12-NEXT: v_not_b32_e32 v9, v6 -; GFX12-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -4479,14 +3943,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB11_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB11_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo @@ -4508,7 +3972,7 @@ define bfloat 
@buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX12-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -4523,8 +3987,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB11_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -4533,13 +3997,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB15_3 +; GFX12-NEXT: s_cbranch_execnz .LBB11_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -4550,7 +4014,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 ; GFX940-NEXT: v_not_b32_e32 v10, v4 ; GFX940-NEXT: 
s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -4562,15 +4026,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NEXT: s_cbranch_execnz .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v5 ; GFX940-NEXT: s_movk_i32 s10, 0x7fff -; GFX940-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB11_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB11_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX940-NEXT: s_mov_b64 s[8:9], exec @@ -4585,7 +4049,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX940-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -4599,8 +4063,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] 
-; GFX940-NEXT: s_cbranch_execnz .LBB15_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB11_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -4608,13 +4072,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB15_3 +; GFX940-NEXT: s_cbranch_execnz .LBB11_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -4627,7 +4091,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX11-NEXT: v_not_b32_e32 v9, v6 -; GFX11-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -4640,15 +4104,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: s_cbranch_execnz .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; 
GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB11_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB11_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -4670,7 +4134,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX11-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -4685,8 +4149,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB11_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -4696,14 +4160,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB15_3 +; GFX11-NEXT: s_cbranch_execnz .LBB11_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] 
; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -4714,7 +4178,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 ; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX10-NEXT: v_not_b32_e32 v9, v6 -; GFX10-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -4726,13 +4190,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: s_cbranch_execnz .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX10-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB11_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB11_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_mov_b32 s6, exec_lo @@ -4747,7 +4211,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX10-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: 
v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -4761,8 +4225,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB15_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB11_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -4772,13 +4236,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB15_3 +; GFX10-NEXT: s_cbranch_execnz .LBB11_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -4789,7 +4253,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 ; GFX90A-NEXT: v_not_b32_e32 v10, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -4801,15 +4265,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; 
GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v11, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff -; GFX90A-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB11_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB11_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_min_f32_e32 v4, v4, v11 @@ -4822,7 +4286,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX90A-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -4835,8 +4299,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB11_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -4844,13 +4308,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: v_mov_b32_e32 
v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB11_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -4861,7 +4325,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX908-NEXT: v_not_b32_e32 v9, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -4873,15 +4337,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff -; GFX908-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB11_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB11_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: 
v_min_f32_e32 v4, v4, v10 @@ -4895,7 +4359,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX908-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -4908,8 +4372,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB15_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB11_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4917,13 +4381,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB15_3 +; GFX908-NEXT: s_cbranch_execnz .LBB11_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 @@ -4934,7 +4398,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX8-NEXT: v_not_b32_e32 v9, 
v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -4946,14 +4410,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX8-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB11_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB11_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f32_e32 v4, v4, v10 @@ -4969,7 +4433,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX8-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -4982,8 +4446,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB15_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB11_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 ; 
GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4991,13 +4455,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB15_3 +; GFX8-NEXT: s_cbranch_execnz .LBB11_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -5007,7 +4471,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -5018,15 +4482,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 -; GFX7-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB11_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB15_4 Depth 2 +; 
GFX7-NEXT: ; Child Loop BB11_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -5039,7 +4503,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec ; GFX7-NEXT: v_mov_b32_e32 v5, v6 -; GFX7-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX7-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -5052,8 +4516,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB15_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB11_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -5061,14 +4525,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB15_3 +; GFX7-NEXT: s_cbranch_execnz .LBB11_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -5078,7 +4542,7 @@ define bfloat 
@buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -5089,15 +4553,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB15_1 +; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 -; GFX6-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB11_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB11_4 Depth 2 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -5110,7 +4574,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_mov_b32_e32 v5, v6 -; GFX6-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX6-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -5123,8 +4587,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz 
.LBB15_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB11_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -5132,7 +4596,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB15_3 +; GFX6-NEXT: s_cbranch_execnz .LBB11_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 @@ -5140,7 +4604,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst ret bfloat %result } @@ -5148,22 +4612,22 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; <2 x half> ; -------------------------------------------------------------------- -define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 
v0 :: v_dual_mov_b32 v0, s4 -; GFX12-NEXT: s_addk_co_i32 s4, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 @@ -5180,22 +4644,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: v_mov_b32_e32 v0, s6 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s4, 0x400 +; GFX940-NEXT: s_addk_i32 s6, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 @@ -5210,22 +4674,22 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: s_cbranch_execnz .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_add_i32 s4, s6, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 @@ -5243,22 +4707,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: 
v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: v_mov_b32_e32 v0, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_add_i32 s4, s18, 0x400 +; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 @@ -5267,57 +4735,65 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: 
v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v0, s18 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s10 -; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5 ; GFX90A-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; 
GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s10, s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s10 -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 @@ -5325,29 +4801,33 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX908-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; 
GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v1, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s10 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -5358,34 +4838,38 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: v_or_b32_e32 v5, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; 
GFX7-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX7-NEXT: s_add_i32 s10, s8, 0x400 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -5401,37 +4885,41 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB16_1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 
s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX6-NEXT: s_add_i32 s10, s8, 0x400 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s10 -; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -5448,40 +4936,40 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz 
.LBB16_1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst ret <2 x half> %result } -define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s4 -; GFX12-NEXT: s_addk_co_i32 s4, 0x400 +; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 ; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 -; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 @@ -5497,21 +4985,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: v_mov_b32_e32 v1, s6 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s4, 0x400 +; GFX940-NEXT: s_addk_i32 s6, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5526,21 +5014,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_add_i32 s4, s6, 0x400 ; GFX11-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: 
buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5557,21 +5045,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v1, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_add_i32 s4, s18, 0x400 +; GFX10-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, s8 -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5579,85 +5071,97 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX10-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap 
v[4:5], v3, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s8 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v1, s18 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s10 -; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX90A-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], 
v3, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s8 -; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s10, s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, s18 +; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, s10 -; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX908-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: 
buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s18 +; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s10 -; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -5667,35 +5171,39 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, 
s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: s_add_i32 s10, s8, 0x400 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -5711,37 +5219,41 @@ define void 
@buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: s_add_i32 s10, s8, 0x400 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_mov_b32_e32 
v2, s10 -; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -5758,27 +5270,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB17_1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst ret void } -define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall(ptr addrspace(7) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5787,7 +5299,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -5801,14 +5313,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB14_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB14_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo @@ -5817,7 +5329,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_pk_min_num_f16 v5, v4, v8 ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX12-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -5832,8 +5344,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_loadcnt 
0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB14_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -5842,18 +5354,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB18_3 +; GFX12-NEXT: s_cbranch_execnz .LBB14_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -5866,21 +5378,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_1 +; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_pk_max_f16 v9, v5, 
v5 -; GFX940-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB14_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB14_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v4, v7, v7 ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: v_pk_min_f16 v6, v4, v9 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX940-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -5894,8 +5406,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB14_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -5903,19 +5415,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB18_3 +; GFX940-NEXT: s_cbranch_execnz .LBB14_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: 
v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -5929,14 +5441,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB14_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB14_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -5945,7 +5457,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: v_pk_min_f16 v5, v4, v8 ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX11-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -5960,8 +5472,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB14_4 +; GFX11-NEXT: ; %bb.5: ; in 
Loop: Header=BB14_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -5971,19 +5483,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB18_3 +; GFX11-NEXT: s_cbranch_execnz .LBB14_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -5996,13 +5508,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_1 +; GFX10-NEXT: s_cbranch_execnz .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_pk_max_f16 v8, v5, v5 -; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB14_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB14_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo @@ -6010,7 +5522,7 @@ define 
<2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: v_pk_min_f16 v5, v4, v8 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX10-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -6024,8 +5536,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB14_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -6035,18 +5547,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB18_3 +; GFX10-NEXT: s_cbranch_execnz .LBB14_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: 
v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -6059,20 +5571,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5 -; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB14_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB14_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7 ; GFX90A-NEXT: v_pk_min_f16 v6, v4, v9 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX90A-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -6085,8 +5597,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB14_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -6094,18 +5606,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 
exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB14_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -6118,21 +5630,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_pk_max_f16 v8, v5, v5 -; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB14_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB14_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX908-NEXT: v_pk_min_f16 v5, v4, v8 ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX908-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; 
GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -6145,8 +5657,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB14_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -6154,18 +5666,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB18_3 +; GFX908-NEXT: s_cbranch_execnz .LBB14_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -6178,15 +5690,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; 
%bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v9, v5, v5 -; GFX8-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB14_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB14_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 @@ -6196,7 +5708,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX8-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -6209,8 +5721,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB14_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -6218,18 +5730,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB18_3 +; GFX8-NEXT: s_cbranch_execnz .LBB14_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: 
v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -6241,7 +5753,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: ; implicit-def: $vgpr4 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB18_1 +; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -6253,9 +5765,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB14_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB14_4 Depth 2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: s_mov_b64 s[12:13], exec @@ -6271,7 +5783,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX7-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 
s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -6284,8 +5796,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB18_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB14_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 @@ -6295,19 +5807,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB18_3 +; GFX7-NEXT: s_cbranch_execnz .LBB14_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -6319,7 +5831,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: ; implicit-def: $vgpr4 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB18_1 +; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; 
GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -6331,9 +5843,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v8 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB14_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB14_4 Depth 2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: s_mov_b64 s[12:13], exec @@ -6350,7 +5862,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX6-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -6363,8 +5875,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB18_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB14_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 @@ -6374,7 +5886,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB18_3 +; GFX6-NEXT: s_cbranch_execnz .LBB14_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 
exec, exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, v4 @@ -6382,7 +5894,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst ret <2 x half> %result } @@ -6390,23 +5902,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; <2 x bfloat> ; -------------------------------------------------------------------- -define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 -; GFX12-NEXT: s_addk_co_i32 s4, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: v_mov_b32_e32 v4, s4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v0 @@ -6439,25 +5951,25 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: v_mov_b32_e32 v0, s6 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s4, 0x400 +; GFX940-NEXT: s_add_i32 s4, s6, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s9, 0x7060302 ; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v0 @@ -6484,16 +5996,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB19_1 +; GFX940-NEXT: s_cbranch_execnz .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; 
GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_add_i32 s4, s6, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -6502,7 +6014,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v0 @@ -6536,24 +6048,28 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 -; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s8 +; GFX10-NEXT: v_mov_b32_e32 v0, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_add_i32 s4, s18, 0x400 +; 
GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -6569,38 +6085,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_cbranch_execnz .LBB19_1 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB15_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_addk_i32 s8, 0x400 -; GFX90A-NEXT: s_mov_b64 s[10:11], 0 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v0, s18 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s8 -; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v0 @@ -6615,36 +6135,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s12 ; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[8:9] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s13 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 -; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX90A-NEXT: s_cbranch_execnz 
.LBB19_1 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_addk_i32 s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[10:11], 0 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s18, 0x400 +; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s13, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s8 -; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v0 @@ -6659,35 +6183,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX908-NEXT: v_add3_u32 v5, v5, v0, s12 ; GFX908-NEXT: v_add3_u32 v8, v8, v1, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] ; GFX908-NEXT: 
v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX908-NEXT: v_perm_b32 v5, v1, v0, s13 ; GFX908-NEXT: v_mov_b32_e32 v0, v5 ; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_addk_i32 s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[10:11], 0 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s18, 0x400 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -6704,40 +6232,44 @@ define <2 x bfloat> 
@buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024 -; GFX7-NEXT: s_addk_i32 s8, 0x400 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s18, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_mov_b64 
s[10:11], 0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -6751,35 +6283,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v1 ; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX7-NEXT: s_cbranch_execnz .LBB19_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024 -; GFX6-NEXT: s_addk_i32 s8, 0x400 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: 
s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s18, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_mov_b64 s[10:11], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s8 -; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -6794,39 +6330,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v1 ; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX6-NEXT: s_cbranch_execnz .LBB19_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB15_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + 
%result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst ret <2 x bfloat> %result } -define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX12-NEXT: s_addk_co_i32 s4, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 @@ -6857,24 +6393,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: v_mov_b32_e32 v1, s6 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s4, 0x400 +; GFX940-NEXT: s_add_i32 s4, s6, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX940-NEXT: s_mov_b32 s9, 0x7060302 ; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6901,23 +6437,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v1, v6 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB20_1 +; GFX940-NEXT: s_cbranch_execnz .LBB16_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_add_i32 s4, s6, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; 
GFX11-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 @@ -6949,23 +6485,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: s_cbranch_execnz .LBB16_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v1, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_add_i32 s4, s18, 0x400 +; GFX10-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, s8 -; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6980,38 +6520,42 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v6, v6, v0, 
0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 ; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_cbranch_execnz .LBB20_1 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s8 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_addk_i32 s8, 0x400 -; GFX90A-NEXT: s_mov_b64 s[10:11], 0 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v1, s18 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 
v3, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s8 -; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -7025,36 +6569,40 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s8 -; GFX908-NEXT: buffer_load_dword v1, v1, 
s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_addk_i32 s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[10:11], 0 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, s18 +; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s18, 0x400 +; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX908-NEXT: s_mov_b32 s13, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s8 -; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -7068,35 +6616,39 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX908-NEXT: v_add3_u32 v6, v6, v0, s12 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX908-NEXT: v_perm_b32 v0, v5, v0, s13 ; GFX908-NEXT: v_mov_b32_e32 v6, v1 ; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 
+; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_addk_i32 s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[10:11], 0 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s18 +; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s18, 0x400 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -7112,41 +6664,45 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; 
GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 -; GFX7-NEXT: s_addk_i32 s8, 0x400 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s18, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 -; GFX7-NEXT: s_mov_b64 s[10:11], 0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -7160,35 +6716,39 @@ define void 
@buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX7-NEXT: s_cbranch_execnz .LBB20_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 -; GFX6-NEXT: s_addk_i32 s8, 0x400 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s18, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0 -; GFX6-NEXT: s_mov_b64 s[10:11], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: .LBB20_1: 
; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -7203,26 +6763,26 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX6-NEXT: s_cbranch_execnz .LBB20_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst ret void } -define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -7231,7 +6791,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -7245,15 +6805,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX12-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB17_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB17_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6 @@ -7277,7 +6837,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX12-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -7292,8 +6852,8 @@ define <2 x bfloat> 
@buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB17_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -7302,18 +6862,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB21_3 +; GFX12-NEXT: s_cbranch_execnz .LBB17_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -7326,7 +6886,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_1 +; GFX940-NEXT: s_cbranch_execnz .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: 
s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 @@ -7334,9 +6894,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_movk_i32 s10, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX940-NEXT: s_mov_b32 s11, 0x7060302 -; GFX940-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB17_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB17_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX940-NEXT: v_min_f32_e32 v4, v4, v9 @@ -7357,7 +6917,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc ; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX940-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -7371,8 +6931,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB17_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -7380,19 +6940,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB21_3 +; GFX940-NEXT: s_cbranch_execnz .LBB17_3 ; GFX940-NEXT: ; 
%bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -7406,16 +6966,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: s_cbranch_execnz .LBB17_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB17_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB17_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 @@ -7439,7 +6999,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX11-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 ; GFX11-NEXT: 
; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -7454,8 +7014,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB17_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -7465,20 +7025,20 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB21_3 +; GFX11-NEXT: s_cbranch_execnz .LBB17_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -7491,14 +7051,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: 
s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB21_1 +; GFX10-NEXT: s_cbranch_execnz .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB17_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB17_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 @@ -7519,7 +7079,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX10-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -7533,8 +7093,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB21_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB17_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -7544,18 +7104,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB21_3 +; GFX10-NEXT: s_cbranch_execnz .LBB17_3 ; GFX10-NEXT: ; 
%bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -7568,7 +7128,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 @@ -7576,9 +7136,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 -; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB17_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB17_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX90A-NEXT: v_min_f32_e32 v4, v4, v9 @@ -7597,7 +7157,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; 
GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX90A-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -7610,8 +7170,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB17_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -7619,18 +7179,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB17_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -7643,7 +7203,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: 
buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 +; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 @@ -7651,9 +7211,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_movk_i32 s14, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX908-NEXT: s_mov_b32 s15, 0x7060302 -; GFX908-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB17_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB17_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX908-NEXT: v_min_f32_e32 v4, v4, v8 @@ -7673,7 +7233,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX908-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -7686,8 +7246,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB17_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -7695,18 +7255,18 @@ define <2 x bfloat> 
@buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB21_3 +; GFX908-NEXT: s_cbranch_execnz .LBB17_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -7719,15 +7279,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB17_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB17_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX8-NEXT: v_min_f32_e32 v4, v4, v8 @@ -7750,7 +7310,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 
s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX8-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -7763,8 +7323,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB17_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -7772,18 +7332,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB21_3 +; GFX8-NEXT: s_cbranch_execnz .LBB17_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -7795,7 +7355,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: buffer_load_dword v7, v4, 
s[8:11], 0 offen offset:1024 ; GFX7-NEXT: ; implicit-def: $vgpr4 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB21_1 +; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 @@ -7806,9 +7366,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB17_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB17_4 Depth 2 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v7 @@ -7822,7 +7382,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX7-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -7835,8 +7395,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB21_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB17_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 @@ -7845,19 +7405,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; 
GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB21_3 +; GFX7-NEXT: s_cbranch_execnz .LBB17_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v0, v7 ; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -7869,7 +7429,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: ; implicit-def: $vgpr4 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB21_1 +; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v6 @@ -7880,9 +7440,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX6-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB17_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB17_4 Depth 2 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v7 @@ -7896,7 +7456,7 @@ define <2 x bfloat> 
@buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX6-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -7909,8 +7469,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB21_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB17_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 @@ -7920,14 +7480,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB21_3 +; GFX6-NEXT: s_cbranch_execnz .LBB17_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, v7 ; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst ret <2 x bfloat> %result } @@ -7935,21 +7495,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; misc ; -------------------------------------------------------------------- -define float 
@buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: +define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 -; GFX12-NEXT: s_addk_co_i32 s4, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_num_f32 v2, v1, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 @@ -7966,22 +7526,22 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s4 
+; GFX940-NEXT: v_mov_b32_e32 v0, s6 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s4, 0x400 +; GFX940-NEXT: s_addk_i32 s6, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 @@ -7995,21 +7555,21 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB22_1 +; GFX940-NEXT: s_cbranch_execnz .LBB18_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_add_i32 s4, s6, 0x400 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 @@ -8027,22 +7587,26 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_addk_i32 s8, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: v_mov_b32_e32 v0, s18 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_add_i32 s4, s18, 0x400 +; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 @@ -8051,29 +7615,33 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_cbranch_execnz .LBB22_1 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, 
s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s8 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: v_mov_b32_e32 v0, s18 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s10 -; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 @@ -8081,29 +7649,33 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: 
s_cbranch_execnz .LBB22_1 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s8 -; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s10, s8, 0x400 -; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s10 -; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 @@ -8111,28 +7683,32 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; 
GFX908-NEXT: s_cbranch_execnz .LBB22_1 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s10, s8, 0x400 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 -; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v0 @@ -8140,28 +7716,32 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 +; GFX8-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s10, s8, 0x400 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, s10 -; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v3, s6 +; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v0 @@ -8169,28 +7749,32 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX7-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB22_1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 
+; GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s10, s8, 0x400 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, s10 -; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v3, s6 +; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v0 @@ -8199,22 +7783,22 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX6-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v0, v4 ; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB22_1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(7) %gep, float %val seq_cst ret float %result } attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" } -!0 = !{} + diff --git a/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll b/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll index 16f29cc329976..08a997530d3c9 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @buffer_ptr_vector_ops(ptr addrspace(1) %somewhere) { ; GISEL-LABEL: buffer_ptr_vector_ops: ; GISEL: ; %bb.0: ; %main_body -; GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GISEL-NEXT: v_mov_b32_e32 v8, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 @@ -25,7 +25,7 @@ define amdgpu_kernel void @buffer_ptr_vector_ops(ptr addrspace(1) %somewhere) { ; ; SDAG-LABEL: buffer_ptr_vector_ops: ; SDAG: ; %bb.0: ; %main_body -; SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; SDAG-NEXT: v_mov_b32_e32 v8, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 @@ -60,16 +60,16 @@ main_body: define amdgpu_kernel void @buffer_structs(%fat_buffer_struct %arg, ptr addrspace(1) %dest) { ; GISEL-LABEL: buffer_structs: ; GISEL: ; %bb.0: ; %main_body -; GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 -; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 
0x44 ; GISEL-NEXT: v_mov_b32_e32 v5, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_ashr_i32 s3, s2, 31 -; GISEL-NEXT: s_lshl_b64 s[0:1], s[2:3], 5 +; GISEL-NEXT: s_ashr_i32 s1, s0, 31 +; GISEL-NEXT: v_mov_b32_e32 v4, s0 +; GISEL-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 ; GISEL-NEXT: s_add_u32 s0, s8, s0 ; GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-NEXT: v_mov_b32_e32 v4, s2 ; GISEL-NEXT: s_addc_u32 s1, s9, s1 ; GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GISEL-NEXT: v_mov_b32_e32 v2, s6 @@ -81,15 +81,15 @@ define amdgpu_kernel void @buffer_structs(%fat_buffer_struct %arg, ptr addrspace ; ; SDAG-LABEL: buffer_structs: ; SDAG: ; %bb.0: ; %main_body -; SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 -; SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 +; SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; SDAG-NEXT: v_mov_b32_e32 v4, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: s_ashr_i32 s3, s2, 31 -; SDAG-NEXT: s_lshl_b64 s[0:1], s[2:3], 5 +; SDAG-NEXT: s_ashr_i32 s1, s0, 31 +; SDAG-NEXT: v_mov_b32_e32 v0, s0 +; SDAG-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 ; SDAG-NEXT: s_add_u32 s0, s8, s0 -; SDAG-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-NEXT: s_addc_u32 s1, s9, s1 ; SDAG-NEXT: buffer_store_dword v0, v0, s[4:7], 0 offen ; SDAG-NEXT: global_store_dword v4, v0, s[0:1] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll index b26d15ed3a1c8..8293280609517 100644 --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; GFX6-LABEL: build_vector2: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, 5 @@ -19,7 +19,7 @@ 
define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; ; GFX8-LABEL: build_vector2: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 6 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -30,7 +30,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; ; GFX10-LABEL: build_vector2: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 5 ; GFX10-NEXT: v_mov_b32_e32 v1, 6 @@ -40,7 +40,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; ; GFX11-LABEL: build_vector2: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 5 ; GFX11-NEXT: v_mov_b32_e32 v1, 6 @@ -52,7 +52,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; ; GFX940-LABEL: build_vector2: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, 5 ; GFX940-NEXT: v_mov_b32_e32 v1, 6 @@ -67,7 +67,7 @@ entry: define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; GFX6-LABEL: build_vector4: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, 5 @@ -80,7 +80,7 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; ; GFX8-LABEL: build_vector4: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: 
v_mov_b32_e32 v1, 6 ; GFX8-NEXT: v_mov_b32_e32 v2, 7 @@ -93,7 +93,7 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; ; GFX10-LABEL: build_vector4: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 5 ; GFX10-NEXT: v_mov_b32_e32 v1, 6 @@ -105,7 +105,7 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; ; GFX11-LABEL: build_vector4: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 5 ; GFX11-NEXT: v_mov_b32_e32 v1, 6 @@ -119,7 +119,7 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; ; GFX940-LABEL: build_vector4: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, 5 ; GFX940-NEXT: v_mov_b32_e32 v1, 6 @@ -136,7 +136,7 @@ entry: define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; GFX6-LABEL: build_vector_v2i16: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x60005 @@ -146,7 +146,7 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; ; GFX8-LABEL: build_vector_v2i16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x60005 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -156,7 +156,7 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; ; GFX10-LABEL: build_vector_v2i16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x60005 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -165,7 +165,7 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; ; GFX11-LABEL: build_vector_v2i16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x60005 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -176,7 +176,7 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; ; GFX940-LABEL: build_vector_v2i16: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, 0x60005 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -190,8 +190,8 @@ entry: define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 %a) { ; GFX6-LABEL: build_vector_v2i16_trunc: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -201,10 +201,10 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; ; GFX8-LABEL: build_vector_v2i16_trunc: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s2, s2, 16 +; GFX8-NEXT: s_lshr_b32 s2, s4, 16 ; GFX8-NEXT: s_or_b32 s2, s2, 0x50000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -215,11 +215,11 @@ define amdgpu_kernel void 
@build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; GFX10-LABEL: build_vector_v2i16_trunc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s2, s2, 16 +; GFX10-NEXT: s_lshr_b32 s2, s4, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, 5 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -228,11 +228,11 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; GFX11-LABEL: build_vector_v2i16_trunc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_pack_hl_b32_b16 s2, s2, 5 +; GFX11-NEXT: s_pack_hl_b32_b16 s2, s4, 5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -242,14 +242,14 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; ; GFX940-LABEL: build_vector_v2i16_trunc: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, 5 -; GFX940-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NEXT: s_lshr_b32 s2, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s2, s2, 5 +; GFX940-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NEXT: 
global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm %srl = lshr i32 %a, 16 %trunc = trunc i32 %srl to i16 @@ -262,7 +262,7 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, <4 x i16> %in) { ; GFX6-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -277,7 +277,7 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; ; GFX8-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 @@ -290,7 +290,7 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; ; GFX10-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshl_b32 s2, s2, 16 @@ -302,7 +302,7 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; ; GFX11-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 @@ -316,7 +316,7 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; ; GFX940-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s3, s3, 16 diff --git a/llvm/test/CodeGen/AMDGPU/call-constexpr.ll b/llvm/test/CodeGen/AMDGPU/call-constexpr.ll index f1992d71eb1de..5d1647782b0d8 100644 --- a/llvm/test/CodeGen/AMDGPU/call-constexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/call-constexpr.ll @@ -50,7 +50,7 @@ define amdgpu_kernel void @test_bitcast_argument_and_return_types() #0 { ; GCN-NEXT: v_and_b32_e32 [[TMP:v[0-9]+]], 0x3ff, v31 ; GCN-NEXT: v_add_i32_e32 v0, vcc, [[TMP]], v0 ; GCN-NEXT: s_setpc_b64 -define hidden i32 @use_workitem_id_x(i32 %arg0) #0 { +define hidden i32 @use_workitem_id_x(i32 %arg0) #3 { %id = call i32 @llvm.amdgcn.workitem.id.x() %op = add i32 %id, %arg0 ret i32 %op @@ -64,7 +64,7 @@ define hidden i32 @use_workitem_id_x(i32 %arg0) #0 { ; GCN: v_mov_b32_e32 v0, 9 ; GCN: s_swappc_b64 ; GCN: v_add_f32_e32 -define amdgpu_kernel void @test_bitcast_use_workitem_id_x() #0 { +define amdgpu_kernel void @test_bitcast_use_workitem_id_x() #3 { %val = call float @use_workitem_id_x(i32 9) %op = fadd float %val, 1.0 store volatile float %op, ptr addrspace(1) undef @@ -112,3 +112,4 @@ declare i32 @llvm.amdgcn.workitem.id.x() #2 attributes #0 = { nounwind noinline } attributes #1 = { alwaysinline nounwind } attributes #2 = { nounwind readnone speculatable } +attributes #3 = { nounwind noinline "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll index ed418070ecb50..6af45035d394f 100644 --- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -273,8 +273,8 @@ entry: ret void } -attributes #0 = { nounwind noinline norecurse } -attributes #1 = { nounwind noinline norecurse } +attributes #0 = { nounwind noinline norecurse "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" 
"amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #1 = { nounwind noinline norecurse "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } attributes #2 = { nounwind noinline } !llvm.module.flags = !{!0} diff --git a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll index c62a082459105..06dec7e792389 100644 --- a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll @@ -10,9 +10,9 @@ declare hidden void @callee() #0 define amdgpu_kernel void @known_x_0(ptr addrspace(1) %out) !reqd_work_group_size !0 { ; CHECK-LABEL: known_x_0: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 20, v2 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0 @@ -30,9 +30,9 @@ define amdgpu_kernel void @known_x_0(ptr addrspace(1) %out) !reqd_work_group_siz define amdgpu_kernel void @known_y_0(ptr addrspace(1) %out) !reqd_work_group_size !1 { ; CHECK-LABEL: known_y_0: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_add_u32 
flat_scratch_lo, s10, s15 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_lshl_or_b32 v31, v2, 20, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 @@ -49,9 +49,9 @@ define amdgpu_kernel void @known_y_0(ptr addrspace(1) %out) !reqd_work_group_siz define amdgpu_kernel void @known_z_0(ptr addrspace(1) %out) !reqd_work_group_size !2 { ; CHECK-LABEL: known_z_0: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 @@ -68,9 +68,9 @@ define amdgpu_kernel void @known_z_0(ptr addrspace(1) %out) !reqd_work_group_siz define amdgpu_kernel void @known_yz_0(ptr addrspace(1) %out) !reqd_work_group_size !3 { ; CHECK-LABEL: known_yz_0: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v31, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 @@ -87,9 +87,9 @@ define amdgpu_kernel void @known_yz_0(ptr addrspace(1) %out) !reqd_work_group_si define amdgpu_kernel void @known_xz_0(ptr addrspace(1) %out) !reqd_work_group_size !4 { ; CHECK-LABEL: known_xz_0: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: 
v_lshlrev_b32_e32 v31, 10, v1 ; CHECK-NEXT: s_mov_b32 s32, 0 @@ -107,9 +107,9 @@ define amdgpu_kernel void @known_xz_0(ptr addrspace(1) %out) !reqd_work_group_si define amdgpu_kernel void @known_xyz_0(ptr addrspace(1) %out) !reqd_work_group_size !5 { ; CHECK-LABEL: known_xyz_0: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v31, 0 ; CHECK-NEXT: s_mov_b32 s32, 0 diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll index b711542be5a7f..8ef2d89e76d4e 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll @@ -1,5 +1,7 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: opt -passes=amdgpu-attributor -mcpu=kaveri < %s | llc -enable-ipra=0 | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s +; RUN: opt -passes=amdgpu-attributor -mcpu=gfx900 < %s | llc -enable-ipra=0 | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s + +target triple = "amdgcn-amd-amdhsa" ; GCN-LABEL: {{^}}use_dispatch_ptr: ; GCN: s_load_dword s{{[0-9]+}}, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll index 1d2523d364e55..b52e7918b27ab 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll +++ 
b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll @@ -1,5 +1,7 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7,UNPACKED-TID %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-xnack -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,PACKED-TID %s +; RUN: opt -passes=amdgpu-attributor -mcpu=kaveri < %s | llc -enable-ipra=0 | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7,UNPACKED-TID %s +; RUN: opt -passes=amdgpu-attributor -mcpu=gfx90a -mattr=-xnack < %s | llc -mcpu=gfx90a -mattr=-xnack -enable-ipra=0 | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,PACKED-TID %s + +target triple = "amdgcn-amd-amdhsa" ; GCN-LABEL: {{^}}use_workitem_id_x: ; GCN: s_waitcnt diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll index 5e6f377da28e1..9792c9dabac2f 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -1,4 +1,6 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FIXEDABI %s +; RUN: opt -mcpu=kaveri -passes=amdgpu-attributor < %s | llc -enable-ipra=0 | FileCheck -enable-var-scope -check-prefixes=GCN,FIXEDABI %s + +target triple = "amdgcn-amd-amdhsa" ; GCN-LABEL: {{^}}use_workitem_id_x: ; GCN: s_waitcnt diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index 15ebdd70ae881..231d3d97c8f4f 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -8,7 +8,7 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) { ; SI-LABEL: kernel: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -18,7 +18,7 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) { ; ; VI-LABEL: kernel: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -28,7 +28,7 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) { ; ; GFX11-LABEL: kernel: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -115,21 +115,32 @@ define amdgpu_kernel void @call_coldcc() #0 { ; SI-LABEL: call_coldcc: ; SI: ; %bb.0: ; SI-NEXT: s_mov_b32 s32, 0 -; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s11, 0xe8f000 -; SI-NEXT: s_add_u32 s8, s8, s1 -; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: s_getpc_b64 s[0:1] -; SI-NEXT: s_add_u32 s0, s0, coldcc@gotpcrel32@lo+4 -; SI-NEXT: s_addc_u32 s1, s1, coldcc@gotpcrel32@hi+12 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SI-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s22, -1 +; SI-NEXT: s_mov_b32 s23, 0xe8f000 +; SI-NEXT: s_add_u32 s20, s20, s9 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_mov_b32 s14, s8 +; SI-NEXT: s_mov_b64 s[10:11], s[4:5] +; SI-NEXT: s_add_u32 s8, s2, 36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; SI-NEXT: s_addc_u32 s9, s3, 0 +; SI-NEXT: s_getpc_b64 s[2:3] +; SI-NEXT: s_add_u32 s2, s2, coldcc@gotpcrel32@lo+4 +; SI-NEXT: s_addc_u32 s3, s3, coldcc@gotpcrel32@hi+12 +; SI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; 
SI-NEXT: v_or_b32_e32 v31, v0, v2 ; SI-NEXT: v_mov_b32_e32 v0, 1.0 -; SI-NEXT: s_mov_b64 s[0:1], s[8:9] -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_mov_b64 s[4:5], s[0:1] +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b64 s[0:1], s[20:21] +; SI-NEXT: s_mov_b64 s[2:3], s[22:23] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SI-NEXT: s_swappc_b64 s[30:31], s[16:17] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -141,31 +152,49 @@ define amdgpu_kernel void @call_coldcc() #0 { ; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s90, -1 ; VI-NEXT: s_mov_b32 s91, 0xe80000 -; VI-NEXT: s_add_u32 s88, s88, s1 +; VI-NEXT: s_add_u32 s88, s88, s9 ; VI-NEXT: s_addc_u32 s89, s89, 0 -; VI-NEXT: s_getpc_b64 s[0:1] -; VI-NEXT: s_add_u32 s0, s0, coldcc@gotpcrel32@lo+4 -; VI-NEXT: s_addc_u32 s1, s1, coldcc@gotpcrel32@hi+12 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; VI-NEXT: s_mov_b32 s14, s8 +; VI-NEXT: s_add_u32 s8, s2, 36 +; VI-NEXT: s_addc_u32 s9, s3, 0 +; VI-NEXT: s_getpc_b64 s[2:3] +; VI-NEXT: s_add_u32 s2, s2, coldcc@gotpcrel32@lo+4 +; VI-NEXT: s_addc_u32 s3, s3, coldcc@gotpcrel32@hi+12 +; VI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; VI-NEXT: s_mov_b64 s[10:11], s[4:5] +; VI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_mov_b64 s[4:5], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[88:89] +; VI-NEXT: v_or_b32_e32 v31, v0, v2 +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 ; VI-NEXT: s_mov_b64 s[2:3], s[90:91] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_swappc_b64 s[30:31], s[16:17] ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: call_coldcc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: 
s_add_u32 s0, s0, coldcc@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, coldcc@gotpcrel32@hi+12 -; GFX11-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_add_u32 s8, s2, 36 +; GFX11-NEXT: s_addc_u32 s9, s3, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, coldcc@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, coldcc@gotpcrel32@hi+12 +; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 1.0 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_mov_b32 s12, s13 +; GFX11-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX11-NEXT: s_mov_b32 s13, s14 +; GFX11-NEXT: s_mov_b32 s14, s15 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_endpgm %val = call float @coldcc(float 1.0) @@ -177,21 +206,32 @@ define amdgpu_kernel void @call_fastcc() #0 { ; SI-LABEL: call_fastcc: ; SI: ; %bb.0: ; SI-NEXT: s_mov_b32 s32, 0 -; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s11, 0xe8f000 -; SI-NEXT: s_add_u32 s8, s8, s1 -; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: s_getpc_b64 s[0:1] -; SI-NEXT: s_add_u32 s0, s0, fastcc@gotpcrel32@lo+4 -; SI-NEXT: s_addc_u32 s1, s1, fastcc@gotpcrel32@hi+12 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SI-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s22, -1 +; SI-NEXT: s_mov_b32 s23, 0xe8f000 +; SI-NEXT: s_add_u32 s20, s20, s9 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_mov_b32 s14, s8 +; SI-NEXT: s_mov_b64 s[10:11], s[4:5] +; SI-NEXT: s_add_u32 s8, s2, 36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; SI-NEXT: s_addc_u32 s9, s3, 0 +; SI-NEXT: s_getpc_b64 s[2:3] +; SI-NEXT: s_add_u32 s2, s2, fastcc@gotpcrel32@lo+4 +; 
SI-NEXT: s_addc_u32 s3, s3, fastcc@gotpcrel32@hi+12 +; SI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v31, v0, v2 ; SI-NEXT: v_mov_b32_e32 v0, 1.0 -; SI-NEXT: s_mov_b64 s[0:1], s[8:9] -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_mov_b64 s[4:5], s[0:1] +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b64 s[0:1], s[20:21] +; SI-NEXT: s_mov_b64 s[2:3], s[22:23] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SI-NEXT: s_swappc_b64 s[30:31], s[16:17] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -203,31 +243,49 @@ define amdgpu_kernel void @call_fastcc() #0 { ; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s90, -1 ; VI-NEXT: s_mov_b32 s91, 0xe80000 -; VI-NEXT: s_add_u32 s88, s88, s1 +; VI-NEXT: s_add_u32 s88, s88, s9 ; VI-NEXT: s_addc_u32 s89, s89, 0 -; VI-NEXT: s_getpc_b64 s[0:1] -; VI-NEXT: s_add_u32 s0, s0, fastcc@gotpcrel32@lo+4 -; VI-NEXT: s_addc_u32 s1, s1, fastcc@gotpcrel32@hi+12 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; VI-NEXT: s_mov_b32 s14, s8 +; VI-NEXT: s_add_u32 s8, s2, 36 +; VI-NEXT: s_addc_u32 s9, s3, 0 +; VI-NEXT: s_getpc_b64 s[2:3] +; VI-NEXT: s_add_u32 s2, s2, fastcc@gotpcrel32@lo+4 +; VI-NEXT: s_addc_u32 s3, s3, fastcc@gotpcrel32@hi+12 +; VI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; VI-NEXT: s_mov_b64 s[10:11], s[4:5] +; VI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_mov_b64 s[4:5], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[88:89] +; VI-NEXT: v_or_b32_e32 v31, v0, v2 +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 ; VI-NEXT: s_mov_b64 s[2:3], s[90:91] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: 
s_swappc_b64 s[30:31], s[16:17] ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: call_fastcc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, fastcc@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, fastcc@gotpcrel32@hi+12 -; GFX11-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_add_u32 s8, s2, 36 +; GFX11-NEXT: s_addc_u32 s9, s3, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, fastcc@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, fastcc@gotpcrel32@hi+12 +; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 1.0 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_mov_b32 s12, s13 +; GFX11-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX11-NEXT: s_mov_b32 s13, s14 +; GFX11-NEXT: s_mov_b32 s14, s15 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_endpgm %val = call float @fastcc(float 1.0) @@ -954,7 +1012,7 @@ define amdgpu_ps i16 @ret_ps_mesa_i16() { define amdgpu_kernel void @amd_kernel_i8(i8 %arg0) { ; SI-LABEL: amd_kernel_i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_i32 s0, s0, s0 @@ -965,7 +1023,7 @@ define amdgpu_kernel void @amd_kernel_i8(i8 %arg0) { ; ; VI-LABEL: amd_kernel_i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_i32 s0, s0, s0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -974,7 +1032,7 @@ define amdgpu_kernel void @amd_kernel_i8(i8 %arg0) { ; ; GFX11-LABEL: amd_kernel_i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: 
s_load_b32 s0, s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_i32 s0, s0, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -992,7 +1050,7 @@ entry: define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) { ; SI-LABEL: amd_kernel_v2i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s1, s[0:1], 0x9 +; SI-NEXT: s_load_dword s1, s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1010,7 +1068,7 @@ define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v2i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_i32 s0, s0, s0 @@ -1024,7 +1082,7 @@ define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v2i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0 ; GFX11-NEXT: v_add_nc_u16 v1, s0, s0 @@ -1049,7 +1107,7 @@ entry: define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) { ; SI-LABEL: amd_kernel_v4i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s1, s[0:1], 0x9 +; SI-NEXT: s_load_dword s1, s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1077,7 +1135,7 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v4i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s0, 24 ; VI-NEXT: s_lshr_b32 s2, s0, 16 @@ -1099,7 +1157,7 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v4i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0 ; GFX11-NEXT: s_lshr_b32 s1, s0, 16 @@ -1136,7 +1194,7 @@ entry: define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) { ; SI-LABEL: amd_kernel_v3i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s1, 0 ; SI-NEXT: s_mov_b32 s0, 2 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1160,7 +1218,7 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v3i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1180,7 +1238,7 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v3i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1212,7 +1270,7 @@ entry: define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) { ; SI-LABEL: amd_kernel_v5i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s1, 0 ; SI-NEXT: s_mov_b32 s0, 4 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1245,7 +1303,7 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v5i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s2, s0, 24 ; VI-NEXT: s_lshr_b32 s3, s0, 16 @@ -1273,7 +1331,7 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v5i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: 
v_lshrrev_b16 v0, 8, s0 ; GFX11-NEXT: s_lshr_b32 s2, s0, 16 @@ -1312,7 +1370,7 @@ entry: define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) { ; SI-LABEL: amd_kernel_v8i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1357,7 +1415,7 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v8i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s2, s1, 24 ; VI-NEXT: s_lshr_b32 s3, s1, 16 @@ -1392,7 +1450,7 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v8i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0 ; GFX11-NEXT: v_lshrrev_b16 v1, 8, s1 @@ -1445,7 +1503,7 @@ entry: define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { ; SI-LABEL: amd_kernel_v16i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s4, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1524,7 +1582,7 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v16i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 24 ; VI-NEXT: s_lshr_b32 s5, s3, 16 @@ -1585,7 +1643,7 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v16i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s10, s3, 16 ; GFX11-NEXT: s_lshr_b32 s11, s3, 24 @@ -1666,7 +1724,7 @@ entry: define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { ; SI-LABEL: amd_kernel_v32i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s9, 0 ; SI-NEXT: s_mov_b32 s8, 16 ; SI-NEXT: s_mov_b32 s11, 0xf000 @@ -1816,7 +1874,7 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v32i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v10, 0 ; VI-NEXT: v_mov_b32_e32 v11, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1932,7 +1990,7 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v32i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b16 v3, 8, s2 ; GFX11-NEXT: v_lshrrev_b16 v7, 8, s3 diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index a0499ef6d0f6a..f248708d16ea2 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -18,8 +18,8 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; CISI-LABEL: sadd64rr: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CISI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CISI-NEXT: s_mov_b32 s3, 0xf000 ; CISI-NEXT: s_mov_b32 s2, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -34,8 +34,8 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; ; VI-LABEL: sadd64rr: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; 
VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_add_u32 s0, s6, s0 @@ -48,12 +48,12 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; ; GFX9-LABEL: sadd64rr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s6, s2 -; GFX9-NEXT: s_addc_u32 s1, s7, s3 +; GFX9-NEXT: s_add_u32 s0, s6, s0 +; GFX9-NEXT: s_addc_u32 s1, s7, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -62,12 +62,12 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX1010-LABEL: sadd64rr: ; GFX1010: ; %bb.0: ; %entry ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: s_add_u32 s0, s6, s2 -; GFX1010-NEXT: s_addc_u32 s1, s7, s3 +; GFX1010-NEXT: s_add_u32 s0, s6, s0 +; GFX1010-NEXT: s_addc_u32 s1, s7, s1 ; GFX1010-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -76,8 +76,8 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX1030W32-LABEL: sadd64rr: ; GFX1030W32: ; %bb.0: ; %entry ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX1030W32-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_add_u32 s0, s6, s0 @@ -90,8 +90,8 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX1030W64-LABEL: sadd64rr: ; GFX1030W64: ; %bb.0: ; %entry ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_add_u32 s0, s6, s0 @@ -104,8 +104,8 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-LABEL: sadd64rr: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s6, s0 ; GFX11-NEXT: s_addc_u32 s1, s7, s1 @@ -129,7 +129,7 @@ entry: define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; CISI-LABEL: sadd64ri: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CISI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CISI-NEXT: s_mov_b32 s7, 0xf000 ; CISI-NEXT: s_mov_b32 s6, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -144,7 +144,7 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; ; VI-LABEL: sadd64ri: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s2, 0x56789876 @@ -157,7 +157,7 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX9-LABEL: 
sadd64ri: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s2, s2, 0x56789876 @@ -169,7 +169,7 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX1010-LABEL: sadd64ri: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_add_u32 s2, s2, 0x56789876 @@ -181,7 +181,7 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W32-LABEL: sadd64ri: ; GFX1030W32: ; %bb.0: ; %entry -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_add_u32 s2, s2, 0x56789876 @@ -193,7 +193,7 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W64-LABEL: sadd64ri: ; GFX1030W64: ; %bb.0: ; %entry -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_add_u32 s2, s2, 0x56789876 @@ -205,7 +205,7 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX11-LABEL: sadd64ri: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s2, s2, 0x56789876 ; GFX11-NEXT: s_addc_u32 s3, s3, 0x1234 @@ -229,7 +229,7 @@ entry: define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; CISI-LABEL: vadd64rr: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CISI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 
0x9 ; CISI-NEXT: s_mov_b32 s7, 0xf000 ; CISI-NEXT: s_mov_b32 s6, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -243,7 +243,7 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; ; VI-LABEL: vadd64rr: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s3 ; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0 @@ -255,7 +255,7 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX9-LABEL: vadd64rr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -266,7 +266,7 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX1010-LABEL: vadd64rr: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: v_add_co_u32 v0, s2, s2, v0 @@ -276,7 +276,7 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W32-LABEL: vadd64rr: ; GFX1030W32: ; %bb.0: ; %entry -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: v_add_co_u32 v0, s2, s2, v0 @@ -286,7 +286,7 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W64-LABEL: vadd64rr: ; GFX1030W64: ; %bb.0: ; %entry -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: v_add_co_u32 v0, s[4:5], s2, v0 @@ -296,11 +296,12 @@ define amdgpu_kernel void @vadd64rr(ptr 
addrspace(1) %out, i64 %a) { ; ; GFX11-LABEL: vadd64rr: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 @@ -322,7 +323,7 @@ entry: define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; CISI-LABEL: vadd64ri: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CISI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CISI-NEXT: v_add_i32_e32 v0, vcc, 0x56789876, v0 ; CISI-NEXT: v_mov_b32_e32 v1, 0x1234 ; CISI-NEXT: s_mov_b32 s3, 0xf000 @@ -334,7 +335,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; ; VI-LABEL: vadd64ri: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x56789876, v0 ; VI-NEXT: v_mov_b32_e32 v1, 0x1234 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -346,7 +347,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; ; GFX9-LABEL: vadd64ri: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x56789876, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x1234 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -357,7 +358,8 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; ; GFX1010-LABEL: vadd64ri: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1010-NEXT: s_mov_b32 null, 0 ; GFX1010-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0 ; 
GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, 0, 0x1234, s2 @@ -367,7 +369,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; ; GFX1030W32-LABEL: vadd64ri: ; GFX1030W32: ; %bb.0: ; %entry -; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1030W32-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s2 @@ -377,7 +379,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; ; GFX1030W64-LABEL: vadd64ri: ; GFX1030W64: ; %bb.0: ; %entry -; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1030W64-NEXT: v_add_co_u32 v0, s[2:3], 0x56789876, v0 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s[2:3] @@ -387,9 +389,11 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; ; GFX11-LABEL: vadd64ri: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] @@ -411,8 +415,8 @@ entry: define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; CISI-LABEL: suaddo32: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; CISI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CISI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; CISI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CISI-NEXT: s_mov_b32 s3, 0xf000 ; CISI-NEXT: s_mov_b32 s2, 
-1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -423,23 +427,23 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; VI-LABEL: suaddo32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s2, s2, s3 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_add_i32 s0, s0, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: suaddo32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s2, s3 +; GFX9-NEXT: s_add_i32 s0, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm @@ -447,11 +451,11 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1010-LABEL: suaddo32: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX1010-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1010-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: s_add_i32 s0, s2, s3 +; GFX1010-NEXT: s_add_i32 s0, s0, s1 ; GFX1010-NEXT: v_mov_b32_e32 v1, s0 ; GFX1010-NEXT: global_store_dword v0, v1, s[4:5] ; GFX1010-NEXT: s_endpgm @@ -459,37 +463,37 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W32-LABEL: suaddo32: ; 
GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W32-NEXT: s_add_i32 s2, s2, s3 -; GFX1030W32-NEXT: v_mov_b32_e32 v1, s2 -; GFX1030W32-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1030W32-NEXT: s_add_i32 s0, s0, s1 +; GFX1030W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX1030W32-NEXT: global_store_dword v0, v1, s[2:3] ; GFX1030W32-NEXT: s_endpgm ; ; GFX1030W64-LABEL: suaddo32: ; GFX1030W64: ; %bb.0: ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W64-NEXT: s_add_i32 s2, s2, s3 -; GFX1030W64-NEXT: v_mov_b32_e32 v1, s2 -; GFX1030W64-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1030W64-NEXT: s_add_i32 s0, s0, s1 +; GFX1030W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX1030W64-NEXT: global_store_dword v0, v1, s[2:3] ; GFX1030W64-NEXT: s_endpgm ; ; GFX11-LABEL: suaddo32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s2, s2, s3 +; GFX11-NEXT: s_add_i32 s0, s0, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-NEXT: s_nop 0 
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -513,28 +517,28 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; CISI-LABEL: uaddo32_vcc_user: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CISI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; CISI-NEXT: s_mov_b32 s3, 0xf000 -; CISI-NEXT: s_mov_b32 s2, -1 +; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd +; CISI-NEXT: s_mov_b32 s11, 0xf000 +; CISI-NEXT: s_mov_b32 s10, -1 +; CISI-NEXT: s_mov_b32 s2, s10 ; CISI-NEXT: s_waitcnt lgkmcnt(0) -; CISI-NEXT: s_mov_b32 s0, s4 -; CISI-NEXT: v_mov_b32_e32 v0, s9 -; CISI-NEXT: s_mov_b32 s1, s5 -; CISI-NEXT: v_add_i32_e32 v0, vcc, s8, v0 -; CISI-NEXT: s_mov_b32 s4, s6 -; CISI-NEXT: s_mov_b32 s5, s7 -; CISI-NEXT: s_mov_b32 s6, s2 -; CISI-NEXT: s_mov_b32 s7, s3 +; CISI-NEXT: s_mov_b32 s8, s4 +; CISI-NEXT: v_mov_b32_e32 v0, s13 +; CISI-NEXT: s_mov_b32 s9, s5 +; CISI-NEXT: v_add_i32_e32 v0, vcc, s12, v0 +; CISI-NEXT: s_mov_b32 s0, s6 +; CISI-NEXT: s_mov_b32 s1, s7 +; CISI-NEXT: s_mov_b32 s3, s11 ; CISI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CISI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; CISI-NEXT: buffer_store_byte v1, off, s[4:7], 0 +; CISI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; CISI-NEXT: buffer_store_byte v1, off, s[0:3], 0 ; CISI-NEXT: s_endpgm ; ; VI-LABEL: uaddo32_vcc_user: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s1 @@ -549,12 +553,12 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: uaddo32_vcc_user: ; GFX9: ; 
%bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: global_store_byte v0, v2, s[6:7] @@ -563,11 +567,11 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX1010-LABEL: uaddo32_vcc_user: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: v_add_co_u32 v1, s0, s2, s3 +; GFX1010-NEXT: v_add_co_u32 v1, s0, s0, s1 ; GFX1010-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX1010-NEXT: global_store_dword v0, v1, s[4:5] ; GFX1010-NEXT: global_store_byte v0, v2, s[6:7] @@ -576,8 +580,8 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX1030W32-LABEL: uaddo32_vcc_user: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: v_add_co_u32 v1, s4, s4, s5 @@ -589,8 +593,8 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX1030W64-LABEL: uaddo32_vcc_user: ; GFX1030W64: ; %bb.0: ; 
GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: v_add_co_u32 v1, s[4:5], s4, s5 @@ -602,8 +606,8 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX11-LABEL: uaddo32_vcc_user: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v1, s4, s4, s5 @@ -631,7 +635,7 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 { ; CISI-LABEL: suaddo64: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; CISI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; CISI-NEXT: s_mov_b32 s11, 0xf000 ; CISI-NEXT: s_mov_b32 s10, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -655,7 +659,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; VI-LABEL: suaddo64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s4, s6 @@ -675,7 +679,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX9-LABEL: suaddo64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s6, s4, s6 @@ -692,7 
+696,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1010-LABEL: suaddo64: ; GFX1010: ; %bb.0: -; GFX1010-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1010-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_add_u32 s6, s4, s6 @@ -707,7 +711,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1030W32-LABEL: suaddo64: ; GFX1030W32: ; %bb.0: -; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_add_u32 s6, s4, s6 @@ -722,7 +726,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1030W64-LABEL: suaddo64: ; GFX1030W64: ; %bb.0: -; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_add_u32 s6, s4, s6 @@ -737,7 +741,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX11-LABEL: suaddo64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s6, s4, s6 ; GFX11-NEXT: s_addc_u32 s7, s5, s7 @@ -768,31 +772,31 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #0 { ; CISI-LABEL: vuaddo64: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CISI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; CISI-NEXT: s_mov_b32 s3, 0xf000 -; CISI-NEXT: s_mov_b32 s2, -1 +; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd +; CISI-NEXT: 
s_mov_b32 s11, 0xf000 +; CISI-NEXT: s_mov_b32 s10, -1 +; CISI-NEXT: s_mov_b32 s2, s10 ; CISI-NEXT: s_waitcnt lgkmcnt(0) -; CISI-NEXT: s_mov_b32 s0, s4 -; CISI-NEXT: v_mov_b32_e32 v1, s9 -; CISI-NEXT: v_add_i32_e32 v0, vcc, s8, v0 +; CISI-NEXT: s_mov_b32 s8, s4 +; CISI-NEXT: v_mov_b32_e32 v1, s13 +; CISI-NEXT: v_add_i32_e32 v0, vcc, s12, v0 ; CISI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CISI-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1] -; CISI-NEXT: s_mov_b32 s1, s5 -; CISI-NEXT: s_mov_b32 s4, s6 -; CISI-NEXT: s_mov_b32 s5, s7 -; CISI-NEXT: s_mov_b32 s6, s2 -; CISI-NEXT: s_mov_b32 s7, s3 -; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CISI-NEXT: v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1] +; CISI-NEXT: s_mov_b32 s9, s5 +; CISI-NEXT: s_mov_b32 s0, s6 +; CISI-NEXT: s_mov_b32 s1, s7 +; CISI-NEXT: s_mov_b32 s3, s11 +; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; CISI-NEXT: s_waitcnt expcnt(0) ; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CISI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; CISI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; CISI-NEXT: s_endpgm ; ; VI-LABEL: vuaddo64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v6, s1 @@ -809,14 +813,14 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX9-LABEL: vuaddo64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; 
GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: global_store_byte v2, v0, s[6:7] @@ -825,13 +829,13 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1010-LABEL: vuaddo64: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: v_add_co_u32 v0, s0, s2, v0 -; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX1010-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX1010-NEXT: v_add_co_u32 v0, s2, s0, v0 +; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, s1, 0, s2 +; GFX1010-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] ; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX1010-NEXT: global_store_byte v2, v3, s[6:7] @@ -840,8 +844,8 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W32-LABEL: vuaddo64: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: v_add_co_u32 v0, s6, s4, v0 @@ -855,8 +859,8 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W64-LABEL: vuaddo64: ; GFX1030W64: ; %bb.0: ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x34 -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: v_add_co_u32 v0, s[6:7], s4, v0 @@ -870,13 +874,15 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-LABEL: vuaddo64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s6, s4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX11-NEXT: s_clause 0x1 @@ -903,8 +909,8 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; CISI-LABEL: ssub64rr: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CISI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CISI-NEXT: s_mov_b32 s3, 0xf000 ; CISI-NEXT: s_mov_b32 s2, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -919,8 +925,8 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; ; VI-LABEL: ssub64rr: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; 
VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_sub_u32 s0, s6, s0 @@ -933,12 +939,12 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; ; GFX9-LABEL: ssub64rr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s0, s6, s2 -; GFX9-NEXT: s_subb_u32 s1, s7, s3 +; GFX9-NEXT: s_sub_u32 s0, s6, s0 +; GFX9-NEXT: s_subb_u32 s1, s7, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -947,12 +953,12 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX1010-LABEL: ssub64rr: ; GFX1010: ; %bb.0: ; %entry ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: s_sub_u32 s0, s6, s2 -; GFX1010-NEXT: s_subb_u32 s1, s7, s3 +; GFX1010-NEXT: s_sub_u32 s0, s6, s0 +; GFX1010-NEXT: s_subb_u32 s1, s7, s1 ; GFX1010-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -961,8 +967,8 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX1030W32-LABEL: ssub64rr: ; GFX1030W32: ; %bb.0: ; %entry ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; 
GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_sub_u32 s0, s6, s0 @@ -975,8 +981,8 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX1030W64-LABEL: ssub64rr: ; GFX1030W64: ; %bb.0: ; %entry ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_sub_u32 s0, s6, s0 @@ -989,8 +995,8 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-LABEL: ssub64rr: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s0, s6, s0 ; GFX11-NEXT: s_subb_u32 s1, s7, s1 @@ -1014,7 +1020,7 @@ entry: define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; CISI-LABEL: ssub64ri: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CISI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CISI-NEXT: s_mov_b32 s7, 0xf000 ; CISI-NEXT: s_mov_b32 s6, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -1029,7 +1035,7 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; ; VI-LABEL: ssub64ri: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_sub_u32 s0, 0x56789876, s2 @@ -1042,7 +1048,7 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX9-LABEL: ssub64ri: ; GFX9: ; %bb.0: ; 
%entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sub_u32 s2, 0x56789876, s2 @@ -1054,7 +1060,7 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX1010-LABEL: ssub64ri: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_sub_u32 s2, 0x56789876, s2 @@ -1066,7 +1072,7 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W32-LABEL: ssub64ri: ; GFX1030W32: ; %bb.0: ; %entry -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_sub_u32 s2, 0x56789876, s2 @@ -1078,7 +1084,7 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W64-LABEL: ssub64ri: ; GFX1030W64: ; %bb.0: ; %entry -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_sub_u32 s2, 0x56789876, s2 @@ -1090,7 +1096,7 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX11-LABEL: ssub64ri: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s2, 0x56789876, s2 ; GFX11-NEXT: s_subb_u32 s3, 0x1234, s3 @@ -1114,7 +1120,7 @@ entry: define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; CISI-LABEL: vsub64rr: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CISI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CISI-NEXT: 
s_mov_b32 s7, 0xf000 ; CISI-NEXT: s_mov_b32 s6, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -1128,7 +1134,7 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; ; VI-LABEL: vsub64rr: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s3 ; VI-NEXT: v_sub_u32_e32 v3, vcc, s2, v0 @@ -1140,7 +1146,7 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX9-LABEL: vsub64rr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -1151,7 +1157,7 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX1010-LABEL: vsub64rr: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: v_sub_co_u32 v0, s2, s2, v0 @@ -1161,7 +1167,7 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W32-LABEL: vsub64rr: ; GFX1030W32: ; %bb.0: ; %entry -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: v_sub_co_u32 v0, s2, s2, v0 @@ -1171,7 +1177,7 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W64-LABEL: vsub64rr: ; GFX1030W64: ; %bb.0: ; %entry -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: v_sub_co_u32 v0, s[4:5], s2, v0 @@ -1181,11 +1187,12 @@ define amdgpu_kernel void @vsub64rr(ptr 
addrspace(1) %out, i64 %a) { ; ; GFX11-LABEL: vsub64rr: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_sub_co_u32 v0, s2, s2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1207,7 +1214,7 @@ entry: define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; CISI-LABEL: vsub64ri: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CISI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CISI-NEXT: v_sub_i32_e32 v0, vcc, 0x56789876, v0 ; CISI-NEXT: v_mov_b32_e32 v1, 0x1234 ; CISI-NEXT: s_mov_b32 s3, 0xf000 @@ -1219,7 +1226,7 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; ; VI-LABEL: vsub64ri: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_sub_u32_e32 v0, vcc, 0x56789876, v0 ; VI-NEXT: v_mov_b32_e32 v1, 0x1234 ; VI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc @@ -1231,7 +1238,7 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; ; GFX9-LABEL: vsub64ri: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, 0x56789876, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x1234 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1242,7 +1249,8 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; ; GFX1010-LABEL: vsub64ri: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1010-NEXT: s_mov_b32 null, 0 ; GFX1010-NEXT: v_sub_co_u32 v0, s2, 
0x56789876, v0 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s2, 0x1234, 0, s2 @@ -1252,7 +1260,7 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; ; GFX1030W32-LABEL: vsub64ri: ; GFX1030W32: ; %bb.0: ; %entry -; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1030W32-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s2 @@ -1262,7 +1270,7 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; ; GFX1030W64-LABEL: vsub64ri: ; GFX1030W64: ; %bb.0: ; %entry -; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1030W64-NEXT: v_sub_co_u32 v0, s[2:3], 0x56789876, v0 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s[2:3] @@ -1272,9 +1280,11 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; ; GFX11-LABEL: vsub64ri: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0 ; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] @@ -1297,8 +1307,8 @@ entry: define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; CISI-LABEL: susubo32: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; CISI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CISI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; CISI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CISI-NEXT: s_mov_b32 s3, 0xf000 ; 
CISI-NEXT: s_mov_b32 s2, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -1309,23 +1319,23 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; VI-LABEL: susubo32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sub_i32 s2, s2, s3 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_sub_i32 s0, s0, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: susubo32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_i32 s0, s2, s3 +; GFX9-NEXT: s_sub_i32 s0, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm @@ -1333,11 +1343,11 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1010-LABEL: susubo32: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX1010-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1010-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: s_sub_i32 s0, s2, s3 +; GFX1010-NEXT: s_sub_i32 s0, s0, s1 ; GFX1010-NEXT: v_mov_b32_e32 v1, s0 ; GFX1010-NEXT: global_store_dword v0, v1, s[4:5] ; GFX1010-NEXT: s_endpgm @@ -1345,37 +1355,37 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; 
GFX1030W32-LABEL: susubo32: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W32-NEXT: s_sub_i32 s2, s2, s3 -; GFX1030W32-NEXT: v_mov_b32_e32 v1, s2 -; GFX1030W32-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1030W32-NEXT: s_sub_i32 s0, s0, s1 +; GFX1030W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX1030W32-NEXT: global_store_dword v0, v1, s[2:3] ; GFX1030W32-NEXT: s_endpgm ; ; GFX1030W64-LABEL: susubo32: ; GFX1030W64: ; %bb.0: ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W64-NEXT: s_sub_i32 s2, s2, s3 -; GFX1030W64-NEXT: v_mov_b32_e32 v1, s2 -; GFX1030W64-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1030W64-NEXT: s_sub_i32 s0, s0, s1 +; GFX1030W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX1030W64-NEXT: global_store_dword v0, v1, s[2:3] ; GFX1030W64-NEXT: s_endpgm ; ; GFX11-LABEL: susubo32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sub_i32 s2, s2, s3 +; GFX11-NEXT: s_sub_i32 s0, s0, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, 
s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1399,28 +1409,28 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; CISI-LABEL: usubo32_vcc_user: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CISI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; CISI-NEXT: s_mov_b32 s3, 0xf000 -; CISI-NEXT: s_mov_b32 s2, -1 +; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd +; CISI-NEXT: s_mov_b32 s11, 0xf000 +; CISI-NEXT: s_mov_b32 s10, -1 +; CISI-NEXT: s_mov_b32 s2, s10 ; CISI-NEXT: s_waitcnt lgkmcnt(0) -; CISI-NEXT: s_mov_b32 s0, s4 -; CISI-NEXT: v_mov_b32_e32 v0, s9 -; CISI-NEXT: s_mov_b32 s1, s5 -; CISI-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 -; CISI-NEXT: s_mov_b32 s4, s6 -; CISI-NEXT: s_mov_b32 s5, s7 -; CISI-NEXT: s_mov_b32 s6, s2 -; CISI-NEXT: s_mov_b32 s7, s3 +; CISI-NEXT: s_mov_b32 s8, s4 +; CISI-NEXT: v_mov_b32_e32 v0, s13 +; CISI-NEXT: s_mov_b32 s9, s5 +; CISI-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 +; CISI-NEXT: s_mov_b32 s0, s6 +; CISI-NEXT: s_mov_b32 s1, s7 +; CISI-NEXT: s_mov_b32 s3, s11 ; CISI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CISI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; CISI-NEXT: buffer_store_byte v1, off, s[4:7], 0 +; CISI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; CISI-NEXT: buffer_store_byte v1, off, s[0:3], 0 ; CISI-NEXT: s_endpgm ; ; VI-LABEL: usubo32_vcc_user: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s1 @@ -1435,12 +1445,12 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; ; 
GFX9-LABEL: usubo32_vcc_user: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: global_store_byte v0, v2, s[6:7] @@ -1449,11 +1459,11 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX1010-LABEL: usubo32_vcc_user: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: v_sub_co_u32 v1, s0, s2, s3 +; GFX1010-NEXT: v_sub_co_u32 v1, s0, s0, s1 ; GFX1010-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX1010-NEXT: global_store_dword v0, v1, s[4:5] ; GFX1010-NEXT: global_store_byte v0, v2, s[6:7] @@ -1462,8 +1472,8 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX1030W32-LABEL: usubo32_vcc_user: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: v_sub_co_u32 v1, s4, s4, s5 @@ -1475,8 +1485,8 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX1030W64-LABEL: 
usubo32_vcc_user: ; GFX1030W64: ; %bb.0: ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: v_sub_co_u32 v1, s[4:5], s4, s5 @@ -1488,8 +1498,8 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX11-LABEL: usubo32_vcc_user: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_sub_co_u32 v1, s4, s4, s5 @@ -1517,7 +1527,7 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 { ; CISI-LABEL: susubo64: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; CISI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; CISI-NEXT: s_mov_b32 s11, 0xf000 ; CISI-NEXT: s_mov_b32 s10, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -1541,7 +1551,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; VI-LABEL: susubo64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_sub_u32 s0, s4, s6 @@ -1561,7 +1571,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX9-LABEL: susubo64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-NEXT: s_sub_u32 s6, s4, s6 @@ -1578,7 +1588,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1010-LABEL: susubo64: ; GFX1010: ; %bb.0: -; GFX1010-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1010-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_sub_u32 s6, s4, s6 @@ -1593,7 +1603,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1030W32-LABEL: susubo64: ; GFX1030W32: ; %bb.0: -; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_sub_u32 s6, s4, s6 @@ -1608,7 +1618,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1030W64-LABEL: susubo64: ; GFX1030W64: ; %bb.0: -; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_sub_u32 s6, s4, s6 @@ -1623,7 +1633,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX11-LABEL: susubo64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s6, s4, s6 ; GFX11-NEXT: s_subb_u32 s7, s5, s7 @@ -1654,31 +1664,31 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #0 { ; CISI-LABEL: vusubo64: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CISI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; CISI-NEXT: s_mov_b32 s3, 0xf000 -; CISI-NEXT: s_mov_b32 s2, -1 +; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 
+; CISI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd +; CISI-NEXT: s_mov_b32 s11, 0xf000 +; CISI-NEXT: s_mov_b32 s10, -1 +; CISI-NEXT: s_mov_b32 s2, s10 ; CISI-NEXT: s_waitcnt lgkmcnt(0) -; CISI-NEXT: s_mov_b32 s0, s4 -; CISI-NEXT: v_mov_b32_e32 v1, s9 -; CISI-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 +; CISI-NEXT: s_mov_b32 s8, s4 +; CISI-NEXT: v_mov_b32_e32 v1, s13 +; CISI-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 ; CISI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CISI-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] -; CISI-NEXT: s_mov_b32 s1, s5 -; CISI-NEXT: s_mov_b32 s4, s6 -; CISI-NEXT: s_mov_b32 s5, s7 -; CISI-NEXT: s_mov_b32 s6, s2 -; CISI-NEXT: s_mov_b32 s7, s3 -; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CISI-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] +; CISI-NEXT: s_mov_b32 s9, s5 +; CISI-NEXT: s_mov_b32 s0, s6 +; CISI-NEXT: s_mov_b32 s1, s7 +; CISI-NEXT: s_mov_b32 s3, s11 +; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; CISI-NEXT: s_waitcnt expcnt(0) ; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CISI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; CISI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; CISI-NEXT: s_endpgm ; ; VI-LABEL: vusubo64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v6, s1 @@ -1695,14 +1705,14 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX9-LABEL: vusubo64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: global_store_byte v2, v0, s[6:7] @@ -1711,13 +1721,13 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1010-LABEL: vusubo64: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: v_sub_co_u32 v0, s0, s2, v0 -; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX1010-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX1010-NEXT: v_sub_co_u32 v0, s2, s0, v0 +; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s2, s1, 0, s2 +; GFX1010-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1] ; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX1010-NEXT: global_store_byte v2, v3, s[6:7] @@ -1726,8 +1736,8 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W32-LABEL: vusubo64: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: v_sub_co_u32 v0, s6, s4, v0 @@ -1741,8 +1751,8 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W64-LABEL: vusubo64: ; GFX1030W64: ; %bb.0: ; 
GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: v_sub_co_u32 v0, s[6:7], s4, v0 @@ -1756,13 +1766,15 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-LABEL: vusubo64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_sub_co_u32 v0, s6, s4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s5, 0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX11-NEXT: s_clause 0x1 @@ -1792,8 +1804,8 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; CISI-LABEL: sudiv64: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CISI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0xd ; CISI-NEXT: s_waitcnt lgkmcnt(0) ; CISI-NEXT: s_or_b64 s[0:1], s[6:7], s[2:3] ; CISI-NEXT: s_mov_b32 s0, 0 @@ -1943,8 +1955,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; ; VI-LABEL: sudiv64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_or_b64 s[0:1], s[6:7], s[2:3] ; VI-NEXT: s_mov_b32 s0, 0 @@ -2100,18 +2112,18 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; ; GFX9-LABEL: sudiv64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_or_b64 s[0:1], s[6:7], s[2:3] +; GFX9-NEXT: s_or_b64 s[0:1], s[6:7], s[8:9] ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX9-NEXT: s_sub_u32 s0, 0, s2 -; GFX9-NEXT: s_subb_u32 s1, 0, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GFX9-NEXT: s_sub_u32 s0, 0, s8 +; GFX9-NEXT: s_subb_u32 s1, 0, s9 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2184,24 +2196,24 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: s_mul_i32 s0, s7, s0 ; GFX9-NEXT: s_add_u32 s11, s1, s0 ; GFX9-NEXT: s_addc_u32 s10, 0, s10 -; GFX9-NEXT: s_mul_i32 s0, s2, s10 -; GFX9-NEXT: s_mul_hi_u32 s1, s2, s11 +; GFX9-NEXT: s_mul_i32 s0, s8, s10 +; GFX9-NEXT: s_mul_hi_u32 s1, s8, s11 ; GFX9-NEXT: s_add_i32 s0, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s3, s11 +; GFX9-NEXT: s_mul_i32 s1, s9, s11 ; GFX9-NEXT: s_add_i32 s12, s0, s1 -; GFX9-NEXT: s_mul_i32 s1, s2, s11 +; GFX9-NEXT: s_mul_i32 s1, s8, s11 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_sub_i32 s0, s7, s12 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; 
GFX9-NEXT: s_subb_u32 s13, s0, s3 -; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s2, v0 +; GFX9-NEXT: s_subb_u32 s13, s0, s9 +; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s8, v0 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_subb_u32 s13, s13, 0 -; GFX9-NEXT: s_cmp_ge_u32 s13, s3 +; GFX9-NEXT: s_cmp_ge_u32 s13, s9 ; GFX9-NEXT: s_cselect_b32 s14, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v1 -; GFX9-NEXT: s_cmp_eq_u32 s13, s3 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v1 +; GFX9-NEXT: s_cmp_eq_u32 s13, s9 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s14 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -2219,10 +2231,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX9-NEXT: s_subb_u32 s0, s7, s12 -; GFX9-NEXT: s_cmp_ge_u32 s0, s3 +; GFX9-NEXT: s_cmp_ge_u32 s0, s9 ; GFX9-NEXT: s_cselect_b32 s1, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 -; GFX9-NEXT: s_cmp_eq_u32 s0, s3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 +; GFX9-NEXT: s_cmp_eq_u32 s0, s9 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 @@ -2234,27 +2246,27 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: s_cbranch_execnz .LBB16_3 ; GFX9-NEXT: .LBB16_2: -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX9-NEXT: s_sub_i32 s0, 0, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX9-NEXT: s_sub_i32 s0, 0, s8 ; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s3, v0 -; GFX9-NEXT: s_mul_i32 s0, s0, s3 -; GFX9-NEXT: s_mul_hi_u32 s0, s3, s0 -; GFX9-NEXT: s_add_i32 s3, s3, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s6, s3 -; GFX9-NEXT: s_mul_i32 s7, s0, s2 -; GFX9-NEXT: s_sub_i32 s6, 
s6, s7 -; GFX9-NEXT: s_add_i32 s3, s0, 1 -; GFX9-NEXT: s_sub_i32 s7, s6, s2 -; GFX9-NEXT: s_cmp_ge_u32 s6, s2 -; GFX9-NEXT: s_cselect_b32 s0, s3, s0 -; GFX9-NEXT: s_cselect_b32 s6, s7, s6 -; GFX9-NEXT: s_add_i32 s3, s0, 1 -; GFX9-NEXT: s_cmp_ge_u32 s6, s2 -; GFX9-NEXT: s_cselect_b32 s0, s3, s0 +; GFX9-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9-NEXT: s_mul_i32 s0, s0, s2 +; GFX9-NEXT: s_mul_hi_u32 s0, s2, s0 +; GFX9-NEXT: s_add_i32 s2, s2, s0 +; GFX9-NEXT: s_mul_hi_u32 s0, s6, s2 +; GFX9-NEXT: s_mul_i32 s3, s0, s8 +; GFX9-NEXT: s_sub_i32 s3, s6, s3 +; GFX9-NEXT: s_add_i32 s2, s0, 1 +; GFX9-NEXT: s_sub_i32 s6, s3, s8 +; GFX9-NEXT: s_cmp_ge_u32 s3, s8 +; GFX9-NEXT: s_cselect_b32 s0, s2, s0 +; GFX9-NEXT: s_cselect_b32 s3, s6, s3 +; GFX9-NEXT: s_add_i32 s2, s0, 1 +; GFX9-NEXT: s_cmp_ge_u32 s3, s8 +; GFX9-NEXT: s_cselect_b32 s0, s2, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: .LBB16_3: @@ -2268,18 +2280,18 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-LABEL: sudiv64: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3] -; GFX1010-NEXT: s_mov_b32 s8, 0 -; GFX1010-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX1010-NEXT: s_or_b64 s[2:3], s[6:7], s[8:9] +; GFX1010-NEXT: s_mov_b32 s2, 0 +; GFX1010-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1010-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1010-NEXT: ; %bb.1: -; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX1010-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX1010-NEXT: s_sub_u32 s9, 0, s2 -; GFX1010-NEXT: s_subb_u32 s10, 0, s3 +; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX1010-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GFX1010-NEXT: s_sub_u32 s3, 0, s8 +; GFX1010-NEXT: s_subb_u32 s10, 0, s9 ; GFX1010-NEXT: 
v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX1010-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1010-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2290,11 +2302,11 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX1010-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1010-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1010-NEXT: s_mul_i32 s11, s9, s0 -; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s1 +; GFX1010-NEXT: s_mul_i32 s11, s3, s0 +; GFX1010-NEXT: s_mul_hi_u32 s13, s3, s1 ; GFX1010-NEXT: s_mul_i32 s12, s10, s1 ; GFX1010-NEXT: s_add_i32 s11, s13, s11 -; GFX1010-NEXT: s_mul_i32 s14, s9, s1 +; GFX1010-NEXT: s_mul_i32 s14, s3, s1 ; GFX1010-NEXT: s_add_i32 s11, s11, s12 ; GFX1010-NEXT: s_mul_hi_u32 s13, s1, s14 ; GFX1010-NEXT: s_mul_hi_u32 s15, s0, s14 @@ -2314,76 +2326,76 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1010-NEXT: s_addc_u32 s0, s0, s11 ; GFX1010-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1010-NEXT: s_mul_i32 s11, s9, s0 -; GFX1010-NEXT: s_mul_hi_u32 s12, s9, s1 +; GFX1010-NEXT: s_mul_i32 s11, s3, s0 +; GFX1010-NEXT: s_mul_hi_u32 s12, s3, s1 ; GFX1010-NEXT: s_mul_i32 s10, s10, s1 ; GFX1010-NEXT: s_add_i32 s11, s12, s11 -; GFX1010-NEXT: s_mul_i32 s9, s9, s1 +; GFX1010-NEXT: s_mul_i32 s3, s3, s1 ; GFX1010-NEXT: s_add_i32 s11, s11, s10 -; GFX1010-NEXT: s_mul_hi_u32 s12, s0, s9 -; GFX1010-NEXT: s_mul_i32 s13, s0, s9 -; GFX1010-NEXT: s_mul_hi_u32 s9, s1, s9 +; GFX1010-NEXT: s_mul_hi_u32 s12, s0, s3 +; GFX1010-NEXT: s_mul_i32 s13, s0, s3 +; GFX1010-NEXT: s_mul_hi_u32 s3, s1, s3 ; GFX1010-NEXT: s_mul_hi_u32 s14, s1, s11 ; GFX1010-NEXT: s_mul_i32 s1, s1, s11 ; GFX1010-NEXT: s_mul_hi_u32 s10, s0, s11 -; GFX1010-NEXT: s_add_u32 s1, s9, s1 -; GFX1010-NEXT: s_addc_u32 s9, 0, s14 +; GFX1010-NEXT: s_add_u32 s1, s3, s1 +; GFX1010-NEXT: s_addc_u32 s3, 0, s14 ; GFX1010-NEXT: s_add_u32 s1, s1, s13 ; GFX1010-NEXT: s_mul_i32 s11, s0, s11 -; GFX1010-NEXT: s_addc_u32 s1, s9, 
s12 -; GFX1010-NEXT: s_addc_u32 s9, s10, 0 +; GFX1010-NEXT: s_addc_u32 s1, s3, s12 +; GFX1010-NEXT: s_addc_u32 s3, s10, 0 ; GFX1010-NEXT: s_add_u32 s1, s1, s11 -; GFX1010-NEXT: s_addc_u32 s9, 0, s9 +; GFX1010-NEXT: s_addc_u32 s3, 0, s3 ; GFX1010-NEXT: v_add_co_u32 v0, s1, v0, s1 ; GFX1010-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1010-NEXT: s_addc_u32 s0, s0, s9 +; GFX1010-NEXT: s_addc_u32 s0, s0, s3 ; GFX1010-NEXT: v_readfirstlane_b32 s1, v0 ; GFX1010-NEXT: s_mul_i32 s10, s6, s0 -; GFX1010-NEXT: s_mul_hi_u32 s9, s6, s0 +; GFX1010-NEXT: s_mul_hi_u32 s3, s6, s0 ; GFX1010-NEXT: s_mul_hi_u32 s11, s7, s0 ; GFX1010-NEXT: s_mul_i32 s0, s7, s0 ; GFX1010-NEXT: s_mul_hi_u32 s12, s6, s1 ; GFX1010-NEXT: s_mul_hi_u32 s13, s7, s1 ; GFX1010-NEXT: s_mul_i32 s1, s7, s1 ; GFX1010-NEXT: s_add_u32 s10, s12, s10 -; GFX1010-NEXT: s_addc_u32 s9, 0, s9 +; GFX1010-NEXT: s_addc_u32 s3, 0, s3 ; GFX1010-NEXT: s_add_u32 s1, s10, s1 -; GFX1010-NEXT: s_addc_u32 s1, s9, s13 -; GFX1010-NEXT: s_addc_u32 s9, s11, 0 +; GFX1010-NEXT: s_addc_u32 s1, s3, s13 +; GFX1010-NEXT: s_addc_u32 s3, s11, 0 ; GFX1010-NEXT: s_add_u32 s1, s1, s0 -; GFX1010-NEXT: s_addc_u32 s9, 0, s9 -; GFX1010-NEXT: s_mul_hi_u32 s0, s2, s1 -; GFX1010-NEXT: s_mul_i32 s11, s2, s9 -; GFX1010-NEXT: s_mul_i32 s12, s2, s1 +; GFX1010-NEXT: s_addc_u32 s3, 0, s3 +; GFX1010-NEXT: s_mul_hi_u32 s0, s8, s1 +; GFX1010-NEXT: s_mul_i32 s11, s8, s3 +; GFX1010-NEXT: s_mul_i32 s12, s8, s1 ; GFX1010-NEXT: s_add_i32 s0, s0, s11 ; GFX1010-NEXT: v_sub_co_u32 v0, s11, s6, s12 -; GFX1010-NEXT: s_mul_i32 s10, s3, s1 +; GFX1010-NEXT: s_mul_i32 s10, s9, s1 ; GFX1010-NEXT: s_add_i32 s0, s0, s10 -; GFX1010-NEXT: v_sub_co_u32 v1, s12, v0, s2 +; GFX1010-NEXT: v_sub_co_u32 v1, s12, v0, s8 ; GFX1010-NEXT: s_sub_i32 s10, s7, s0 ; GFX1010-NEXT: s_cmp_lg_u32 s11, 0 -; GFX1010-NEXT: s_subb_u32 s10, s10, s3 +; GFX1010-NEXT: s_subb_u32 s10, s10, s9 ; GFX1010-NEXT: s_cmp_lg_u32 s12, 0 -; GFX1010-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1 +; GFX1010-NEXT: v_cmp_le_u32_e32 vcc_lo, s8, v1 ; 
GFX1010-NEXT: s_subb_u32 s10, s10, 0 -; GFX1010-NEXT: s_cmp_ge_u32 s10, s3 +; GFX1010-NEXT: s_cmp_ge_u32 s10, s9 ; GFX1010-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX1010-NEXT: s_cselect_b32 s12, -1, 0 -; GFX1010-NEXT: s_cmp_eq_u32 s10, s3 +; GFX1010-NEXT: s_cmp_eq_u32 s10, s9 ; GFX1010-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX1010-NEXT: s_add_u32 s10, s1, 1 ; GFX1010-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo -; GFX1010-NEXT: s_addc_u32 s12, s9, 0 +; GFX1010-NEXT: s_addc_u32 s12, s3, 0 ; GFX1010-NEXT: s_add_u32 s13, s1, 2 -; GFX1010-NEXT: s_addc_u32 s14, s9, 0 +; GFX1010-NEXT: s_addc_u32 s14, s3, 0 ; GFX1010-NEXT: s_cmp_lg_u32 s11, 0 -; GFX1010-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v0 +; GFX1010-NEXT: v_cmp_le_u32_e32 vcc_lo, s8, v0 ; GFX1010-NEXT: s_subb_u32 s0, s7, s0 ; GFX1010-NEXT: v_mov_b32_e32 v2, s13 -; GFX1010-NEXT: s_cmp_ge_u32 s0, s3 +; GFX1010-NEXT: s_cmp_ge_u32 s0, s9 ; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX1010-NEXT: s_cselect_b32 s7, -1, 0 -; GFX1010-NEXT: s_cmp_eq_u32 s0, s3 +; GFX1010-NEXT: s_cmp_eq_u32 s0, s9 ; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX1010-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s14 @@ -2391,13 +2403,13 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: v_cndmask_b32_e32 v2, s10, v2, vcc_lo ; GFX1010-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo ; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX1010-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo ; GFX1010-NEXT: v_cndmask_b32_e32 v0, s1, v2, vcc_lo -; GFX1010-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8 +; GFX1010-NEXT: s_andn2_b32 vcc_lo, exec_lo, s2 ; GFX1010-NEXT: s_cbranch_vccnz .LBB16_3 ; GFX1010-NEXT: .LBB16_2: -; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX1010-NEXT: s_sub_i32 s1, 0, s2 +; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX1010-NEXT: s_sub_i32 s1, 0, s8 ; GFX1010-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX1010-NEXT: v_mul_f32_e32 
v0, 0x4f7ffffe, v0 ; GFX1010-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2406,17 +2418,17 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_mul_hi_u32 s1, s0, s1 ; GFX1010-NEXT: s_add_i32 s0, s0, s1 ; GFX1010-NEXT: s_mul_hi_u32 s0, s6, s0 -; GFX1010-NEXT: s_mul_i32 s1, s0, s2 -; GFX1010-NEXT: s_add_i32 s3, s0, 1 +; GFX1010-NEXT: s_mul_i32 s1, s0, s8 +; GFX1010-NEXT: s_add_i32 s2, s0, 1 ; GFX1010-NEXT: s_sub_i32 s1, s6, s1 -; GFX1010-NEXT: s_sub_i32 s6, s1, s2 -; GFX1010-NEXT: s_cmp_ge_u32 s1, s2 -; GFX1010-NEXT: s_cselect_b32 s0, s3, s0 -; GFX1010-NEXT: s_cselect_b32 s1, s6, s1 -; GFX1010-NEXT: s_add_i32 s3, s0, 1 -; GFX1010-NEXT: s_cmp_ge_u32 s1, s2 +; GFX1010-NEXT: s_sub_i32 s3, s1, s8 +; GFX1010-NEXT: s_cmp_ge_u32 s1, s8 +; GFX1010-NEXT: s_cselect_b32 s0, s2, s0 +; GFX1010-NEXT: s_cselect_b32 s1, s3, s1 +; GFX1010-NEXT: s_add_i32 s2, s0, 1 +; GFX1010-NEXT: s_cmp_ge_u32 s1, s8 ; GFX1010-NEXT: s_mov_b32 s1, 0 -; GFX1010-NEXT: s_cselect_b32 s0, s3, s0 +; GFX1010-NEXT: s_cselect_b32 s0, s2, s0 ; GFX1010-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-NEXT: .LBB16_3: @@ -2430,8 +2442,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-LABEL: sudiv64: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3] ; GFX1030W32-NEXT: s_mov_b32 s8, 0 @@ -2592,8 +2604,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W64-LABEL: sudiv64: ; GFX1030W64: ; %bb.0: ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1030W64-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_or_b64 s[0:1], s[6:7], s[2:3] ; GFX1030W64-NEXT: s_mov_b32 s0, 0 @@ -2753,8 +2765,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-LABEL: sudiv64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3] ; GFX11-NEXT: s_mov_b32 s8, 0 diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll index 8e773cad3b335..8a39a52cd25ea 100644 --- a/llvm/test/CodeGen/AMDGPU/cc-update.ll +++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll @@ -27,7 +27,7 @@ entry: define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 { ; GFX803-LABEL: test_kern_stack: ; GFX803: ; %bb.0: ; %entry -; GFX803-NEXT: s_add_u32 s0, s0, s7 +; GFX803-NEXT: s_add_u32 s0, s0, s15 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: v_mov_b32_e32 v0, 0 ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -36,7 +36,7 @@ define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 { ; ; GFX900-LABEL: test_kern_stack: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_add_u32 s0, s0, s7 +; GFX900-NEXT: s_add_u32 s0, s0, s15 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -46,7 +46,7 @@ define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 { ; GFX1010-LABEL: test_kern_stack: ; GFX1010: ; %bb.0: ; %entry ; GFX1010-NEXT: v_mov_b32_e32 v0, 0 -; GFX1010-NEXT: s_add_u32 s0, s0, s7 +; GFX1010-NEXT: s_add_u32 s0, s0, s15 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0 ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1010-NEXT: s_waitcnt_vscnt 
null, 0x0 @@ -266,7 +266,7 @@ entry: define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 { ; GFX803-LABEL: test_force_fp_kern_stack: ; GFX803: ; %bb.0: ; %entry -; GFX803-NEXT: s_add_u32 s0, s0, s7 +; GFX803-NEXT: s_add_u32 s0, s0, s15 ; GFX803-NEXT: s_mov_b32 s33, 0 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: v_mov_b32_e32 v0, 0 @@ -276,7 +276,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 { ; ; GFX900-LABEL: test_force_fp_kern_stack: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_add_u32 s0, s0, s7 +; GFX900-NEXT: s_add_u32 s0, s0, s15 ; GFX900-NEXT: s_mov_b32 s33, 0 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 @@ -287,7 +287,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 { ; GFX1010-LABEL: test_force_fp_kern_stack: ; GFX1010: ; %bb.0: ; %entry ; GFX1010-NEXT: v_mov_b32_e32 v0, 0 -; GFX1010-NEXT: s_add_u32 s0, s0, s7 +; GFX1010-NEXT: s_add_u32 s0, s0, s15 ; GFX1010-NEXT: s_mov_b32 s33, 0 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0 ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 @@ -509,7 +509,7 @@ entry: define amdgpu_kernel void @test_sgpr_offset_kernel() #1 { ; GFX803-LABEL: test_sgpr_offset_kernel: ; GFX803: ; %bb.0: ; %entry -; GFX803-NEXT: s_add_u32 s0, s0, s7 +; GFX803-NEXT: s_add_u32 s0, s0, s15 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) @@ -525,7 +525,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 { ; ; GFX900-LABEL: test_sgpr_offset_kernel: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_add_u32 s0, s0, s7 +; GFX900-NEXT: s_add_u32 s0, s0, s15 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -541,7 +541,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 { ; ; GFX1010-LABEL: test_sgpr_offset_kernel: ; GFX1010: ; %bb.0: ; %entry -; 
GFX1010-NEXT: s_add_u32 s0, s0, s7 +; GFX1010-NEXT: s_add_u32 s0, s0, s15 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0 ; GFX1010-NEXT: s_mov_b32 s4, 0x20000 ; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc dlc diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll index 3c8ea61b0d43b..b46cdb8ab3ba0 100644 --- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -5,12 +5,12 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN-LABEL: test_loop: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s2, s[0:1], 0xa +; GCN-NEXT: s_load_dword s0, s[2:3], 0xa ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s2, -1 +; GCN-NEXT: s_cmp_eq_u32 s0, -1 ; GCN-NEXT: s_cbranch_scc1 .LBB0_3 ; GCN-NEXT: ; %bb.1: ; %for.body.preheader -; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[2:3], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_addk_i32 s0, 0x80 ; GCN-NEXT: s_and_b64 vcc, exec, -1 @@ -118,7 +118,7 @@ for.body: define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN-LABEL: loop_const_true: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[2:3], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_addk_i32 s0, 0x80 ; GCN-NEXT: s_and_b64 vcc, exec, -1 @@ -214,7 +214,7 @@ for.body: define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN-LABEL: loop_const_false: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[2:3], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 m0, -1 @@ -303,7 +303,7 @@ for.body: define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN-LABEL: loop_const_undef: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 +; 
GCN-NEXT: s_load_dword s0, s[2:3], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 m0, -1 @@ -393,7 +393,7 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_read_u8 v0, v0 -; GCN-NEXT: s_load_dword s4, s[0:1], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: s_bitcmp1_b32 s0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll index 1588dde19cfb7..b23249570faa7 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll @@ -25,7 +25,7 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_csub_i32(ptr add ; ; GCN-LABEL: test_sink_small_offset_global_atomic_csub_i32: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll index da609bfa8edea..21e2a85ab18d9 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll @@ -25,7 +25,7 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(ptr add ; ; GCN-LABEL: test_sink_small_offset_global_atomic_fadd_f32: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll 
b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll index 12ef7657b1913..237e06def1576 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll @@ -119,7 +119,7 @@ ret: ; GCN-LABEL: {{^}}sink_ubfe_i16: ; GCN-NOT: lshr -; VI: s_load_dword [[ARG:s[0-9]+]], s[0:1], 0x2c +; VI: s_load_dword [[ARG:s[0-9]+]], s[2:3], 0x2c ; VI: s_bfe_u32 [[BFE:s[0-9]+]], [[ARG]], 0xc0004 ; GCN: s_cbranch_scc{{[0-1]}} diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll index 397efb126053f..ea10547da6ab7 100644 --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -425,9 +425,9 @@ bb: define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) #0 { ; GFX900-LABEL: vload2_private: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: s_add_u32 s0, s0, s9 +; GFX900-NEXT: s_add_u32 s0, s0, s15 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] @@ -456,10 +456,10 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; ; FLATSCR-LABEL: vload2_private: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; FLATSCR-NEXT: s_mov_b32 s4, 0 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] @@ -483,9 +483,9 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; ; GFX10_DEFAULT-LABEL: 
vload2_private: ; GFX10_DEFAULT: ; %bb.0: ; %entry -; GFX10_DEFAULT-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10_DEFAULT-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v2, 0 -; GFX10_DEFAULT-NEXT: s_add_u32 s0, s0, s9 +; GFX10_DEFAULT-NEXT: s_add_u32 s0, s0, s15 ; GFX10_DEFAULT-NEXT: s_addc_u32 s1, s1, 0 ; GFX10_DEFAULT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_DEFAULT-NEXT: global_load_ushort v0, v2, s[4:5] @@ -514,11 +514,11 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; ; FLATSCR_GFX10-LABEL: vload2_private: ; FLATSCR_GFX10: ; %bb.0: ; %entry -; FLATSCR_GFX10-NEXT: s_add_u32 s2, s2, s5 -; FLATSCR_GFX10-NEXT: s_addc_u32 s3, s3, 0 -; FLATSCR_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; FLATSCR_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; FLATSCR_GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; FLATSCR_GFX10-NEXT: s_add_u32 s6, s6, s11 +; FLATSCR_GFX10-NEXT: s_addc_u32 s7, s7, 0 +; FLATSCR_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; FLATSCR_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; FLATSCR_GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v2, 0 ; FLATSCR_GFX10-NEXT: s_mov_b32 s4, 0 ; FLATSCR_GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -545,7 +545,7 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; ; GFX11-LABEL: vload2_private: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll index 84bd9b6f6c5d4..e1717a816de0d 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr 
addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -23,7 +23,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: v_clamp_add_src_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -40,7 +40,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_clamp_add_src_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -51,7 +51,9 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: v_clamp_add_src_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -75,7 +77,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_multi_use_src_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -95,7 +97,7 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr 
addrspace(1) %out, ptr ; ; GFX8-LABEL: v_clamp_multi_use_src_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -115,7 +117,7 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_clamp_multi_use_src_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -129,13 +131,14 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_clamp_multi_use_src_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e64 v2, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc @@ -158,7 +161,7 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_dbg_use_src_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -174,7 +177,7 @@ define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr ad ; ; 
GFX8-LABEL: v_clamp_dbg_use_src_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -191,7 +194,7 @@ define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_clamp_dbg_use_src_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -202,7 +205,9 @@ define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_clamp_dbg_use_src_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -227,7 +232,7 @@ define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_neg_src_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -244,7 +249,7 @@ define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: v_clamp_add_neg_src_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -262,7 +267,7 @@ define amdgpu_kernel void 
@v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_clamp_add_neg_src_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -274,13 +279,14 @@ define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_clamp_add_neg_src_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_floor_f32_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -301,7 +307,7 @@ define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_non_clamp_max_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -318,7 +324,7 @@ define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: v_non_clamp_max_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -336,7 +342,7 @@ define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: 
v_non_clamp_max_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -348,13 +354,14 @@ define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: v_non_clamp_max_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v1, 0, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -373,7 +380,7 @@ define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; SI-LABEL: v_clamp_add_src_f32_denormals: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -389,7 +396,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out, ; ; GFX8-LABEL: v_clamp_add_src_f32_denormals: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -406,7 +413,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_clamp_add_src_f32_denormals: ; GFX9: ; %bb.0: -; 
GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -417,7 +424,9 @@ define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_clamp_add_src_f32_denormals: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -441,7 +450,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out, define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_f16_denorm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -459,7 +468,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: v_clamp_add_src_f16_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -476,7 +485,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_clamp_add_src_f16_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -487,7 +496,9 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, 
ptr ; ; GFX11-LABEL: v_clamp_add_src_f16_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -511,7 +522,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #3 { ; SI-LABEL: v_clamp_add_src_f16_no_denormals: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -529,7 +540,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou ; ; GFX8-LABEL: v_clamp_add_src_f16_no_denormals: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -546,7 +557,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_clamp_add_src_f16_no_denormals: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -557,7 +568,9 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_clamp_add_src_f16_no_denormals: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 
1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -581,7 +594,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -598,7 +611,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: v_clamp_add_src_v2f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -616,7 +629,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_clamp_add_src_v2f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -628,7 +641,9 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: v_clamp_add_src_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -653,7 +668,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_f64: ; SI: ; %bb.0: 
-; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -669,7 +684,7 @@ define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: v_clamp_add_src_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -686,7 +701,7 @@ define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_clamp_add_src_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -697,7 +712,9 @@ define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: v_clamp_add_src_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -721,26 +738,26 @@ define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_clamp_mac_to_mad(ptr addrspace(1) %out, ptr addrspace(1) %aptr, float %a) #0 { ; SI-LABEL: v_clamp_mac_to_mad: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 
2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[0:1], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mad_f32 v3, s8, s8, v2 clamp +; SI-NEXT: v_mad_f32 v3, s0, s0, v2 clamp ; SI-NEXT: v_add_f32_e32 v2, v3, v2 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; GFX8-LABEL: v_clamp_mac_to_mad: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -758,28 +775,31 @@ define amdgpu_kernel void @v_clamp_mac_to_mad(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: v_clamp_mac_to_mad: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_f32 v2, s2, s2, v1 clamp +; GFX9-NEXT: v_mad_f32 v2, s0, s0, v1 clamp ; GFX9-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_mac_to_mad: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 
v0, 2, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: v_mul_f32_e64 v2, s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e64 v2, v2, v1 clamp +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 @@ -802,7 +822,7 @@ define amdgpu_kernel void @v_clamp_mac_to_mad(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_v2f16_denorm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -826,7 +846,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p ; ; GFX8-LABEL: v_clamp_add_src_v2f16_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -846,7 +866,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p ; ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -857,7 +877,9 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p ; ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: 
v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -881,7 +903,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #3 { ; SI-LABEL: v_clamp_add_src_v2f16_no_denormals: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -905,7 +927,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_add_src_v2f16_no_denormals: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -925,7 +947,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_add_src_v2f16_no_denormals: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -936,7 +958,9 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_add_src_v2f16_no_denormals: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -960,7 +984,7 @@ define amdgpu_kernel void 
@v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) % define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_v2f16_denorm_neg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -992,7 +1016,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou ; ; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_neg: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1014,7 +1038,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1026,13 +1050,14 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1053,7 
+1078,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1078,7 +1103,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1) ; ; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1099,7 +1124,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1) ; ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1111,13 +1136,14 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1) ; ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp ; GFX11-NEXT: global_store_b32 v0, v1, 
s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1140,7 +1166,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1) define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1165,7 +1191,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1) ; ; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1186,7 +1212,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1) ; ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1198,13 +1224,14 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1) ; ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp ; 
GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1227,7 +1254,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1) define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_v2f16_denorm_shuf: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1251,7 +1278,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o ; ; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_shuf: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1271,7 +1298,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o ; ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_shuf: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1283,13 +1310,14 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o ; ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_shuf: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 
v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1311,7 +1339,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_no_clamp_add_src_v2f16_f32_src: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1334,7 +1362,7 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou ; ; GFX8-LABEL: v_no_clamp_add_src_v2f16_f32_src: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1354,7 +1382,7 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_no_clamp_add_src_v2f16_f32_src: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1366,13 +1394,14 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_no_clamp_add_src_v2f16_f32_src: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1394,7 +1423,7 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_no_clamp_add_packed_src_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1419,7 +1448,7 @@ define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out, ; ; GFX8-LABEL: v_no_clamp_add_packed_src_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1440,7 +1469,7 @@ define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_no_clamp_add_packed_src_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1452,13 +1481,14 @@ define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_no_clamp_add_packed_src_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1481,7 +1511,7 @@ define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out, define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_no_clamp_add_src_v2f16_f16_src: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 @@ -1505,7 +1535,7 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou ; ; GFX8-LABEL: v_no_clamp_add_src_v2f16_f16_src: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1523,7 +1553,7 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_no_clamp_add_src_v2f16_f16_src: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1536,7 +1566,9 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_no_clamp_add_src_v2f16_f16_src: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll index 947284506a297..9b6c50c10d90d 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp.ll +++ 
b/llvm/test/CodeGen/AMDGPU/clamp.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -24,7 +24,7 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -41,7 +41,7 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -52,7 +52,9 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -65,7 +67,9 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 
v1, v0, s[2:3] @@ -89,7 +93,7 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_neg_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -105,7 +109,7 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: v_clamp_neg_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -122,7 +126,7 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: v_clamp_neg_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -133,7 +137,9 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: v_clamp_neg_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -146,7 +152,9 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: v_clamp_neg_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -171,7 +179,7 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_negabs_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -187,7 +195,7 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: v_clamp_negabs_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -204,7 +212,7 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: v_clamp_negabs_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -215,7 +223,9 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa ; ; GFX11-LABEL: v_clamp_negabs_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -228,7 +238,9 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: v_clamp_negabs_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: 
s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -255,7 +267,7 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_negzero_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -273,7 +285,7 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: v_clamp_negzero_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -292,7 +304,7 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_clamp_negzero_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -305,13 +317,14 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: v_clamp_negzero_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: 
v_add_f32_e32 v1, 0.5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_maxmin_f32 v1, v1, 0x80000000, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -320,13 +333,14 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: v_clamp_negzero_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v1, 0.5, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_maxmin_num_f32 v1, v1, 0x80000000, 1.0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -349,7 +363,7 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_negzero_maybe_snan_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -367,7 +381,7 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ; ; GFX8-LABEL: v_clamp_negzero_maybe_snan_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -386,7 +400,7 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_clamp_negzero_maybe_snan_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -399,13 +413,14 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_clamp_negzero_maybe_snan_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_maxmin_f32 v1, v1, 0x80000000, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -414,13 +429,14 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ; ; GFX12-LABEL: v_clamp_negzero_maybe_snan_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_maxmin_num_f32 v1, v1, 0x80000000, 1.0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -440,7 +456,7 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_multi_use_max_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -461,7 +477,7 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: v_clamp_multi_use_max_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -482,7 +498,7 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_clamp_multi_use_max_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -497,14 +513,16 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_clamp_multi_use_max_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v1, 0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_f32_e32 v2, 1.0, v1 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc @@ -515,14 +533,16 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: v_clamp_multi_use_max_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v1, 0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v2, 1.0, v1 ; GFX12-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -546,7 +566,7 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -563,7 +583,7 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -580,7 +600,7 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -591,7 +611,9 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_f16: ; 
GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -604,7 +626,9 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] @@ -628,7 +652,7 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_neg_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -645,7 +669,7 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: v_clamp_neg_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -662,7 +686,7 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: v_clamp_neg_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -673,7 +697,9 @@ define amdgpu_kernel 
void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: v_clamp_neg_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -686,7 +712,9 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: v_clamp_neg_f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] @@ -711,7 +739,7 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_negabs_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -728,7 +756,7 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: v_clamp_negabs_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -745,7 +773,7 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: v_clamp_negabs_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -756,7 +784,9 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa ; ; GFX11-LABEL: v_clamp_negabs_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -769,7 +799,9 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: v_clamp_negabs_f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] @@ -796,7 +828,7 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_f64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -812,7 +844,7 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -829,7 +861,7 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -840,7 +872,9 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -853,7 +887,9 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -877,7 +913,7 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_neg_f64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -893,7 +929,7 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: v_clamp_neg_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -910,7 +946,7 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace( ; ; 
GFX9-LABEL: v_clamp_neg_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -921,7 +957,9 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: v_clamp_neg_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -934,7 +972,9 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: v_clamp_neg_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -959,7 +999,7 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_negabs_f64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -975,7 +1015,7 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: v_clamp_negabs_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: 
v_mov_b32_e32 v1, s3 @@ -992,7 +1032,7 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: v_clamp_negabs_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -1003,7 +1043,9 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa ; ; GFX11-LABEL: v_clamp_negabs_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -1016,7 +1058,9 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: v_clamp_negabs_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -1043,7 +1087,7 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_aby_negzero_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1060,7 +1104,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p ; ; GFX8-LABEL: v_clamp_med3_aby_negzero_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1078,7 +1122,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p ; ; GFX9-LABEL: v_clamp_med3_aby_negzero_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1090,7 +1134,9 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p ; ; GFX11-LABEL: v_clamp_med3_aby_negzero_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1103,7 +1149,9 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p ; ; GFX12-LABEL: v_clamp_med3_aby_negzero_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1125,7 +1173,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_aby_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1141,7 
+1189,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_med3_aby_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1158,7 +1206,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_aby_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1169,7 +1217,9 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: v_clamp_med3_aby_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1182,7 +1232,9 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_med3_aby_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1204,7 +1256,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_bay_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1220,7 +1272,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_med3_bay_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1237,7 +1289,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_bay_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1248,7 +1300,9 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: v_clamp_med3_bay_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1261,7 +1315,9 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_med3_bay_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1283,7 +1339,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr 
addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_yab_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1299,7 +1355,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_med3_yab_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1316,7 +1372,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_yab_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1327,7 +1383,9 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: v_clamp_med3_yab_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1340,7 +1398,9 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_med3_yab_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1362,7 +1422,7 @@ define 
amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_yba_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1378,7 +1438,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_med3_yba_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1395,7 +1455,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_yba_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1406,7 +1466,9 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: v_clamp_med3_yba_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1419,7 +1481,9 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_med3_yba_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1441,7 +1505,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_ayb_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1457,7 +1521,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_med3_ayb_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1474,7 +1538,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_ayb_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1485,7 +1549,9 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: v_clamp_med3_ayb_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1498,7 +1564,9 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_med3_ayb_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; 
GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1520,7 +1588,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_bya_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1536,7 +1604,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_med3_bya_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1553,7 +1621,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_bya_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1564,7 +1632,9 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: v_clamp_med3_bya_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1577,7 +1647,9 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr 
addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_med3_bya_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1599,7 +1671,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: v_clamp_constants_to_one_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1611,7 +1683,7 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) # ; ; GFX8-LABEL: v_clamp_constants_to_one_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 1.0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1623,7 +1695,7 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) # ; ; GFX9-LABEL: v_clamp_constants_to_one_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1632,8 +1704,10 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) # ; ; GFX11-LABEL: v_clamp_constants_to_one_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 1.0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1642,8 +1716,10 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) # ; ; GFX12-LABEL: v_clamp_constants_to_one_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 1.0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1659,7 +1735,7 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) # define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: v_clamp_constants_to_zero_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1670,7 +1746,7 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) ; ; GFX8-LABEL: v_clamp_constants_to_zero_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1682,7 +1758,7 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: v_clamp_constants_to_zero_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1691,8 
+1767,10 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: v_clamp_constants_to_zero_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1701,8 +1779,10 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: v_clamp_constants_to_zero_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1718,7 +1798,7 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: v_clamp_constant_preserve_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1730,7 +1810,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out) ; ; GFX8-LABEL: v_clamp_constant_preserve_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0.5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ 
-1742,7 +1822,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: v_clamp_constant_preserve_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0.5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1751,8 +1831,10 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: v_clamp_constant_preserve_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0.5 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0.5 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1761,8 +1843,10 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: v_clamp_constant_preserve_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0.5 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0.5 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1778,7 +1862,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out) define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: v_clamp_constant_preserve_denorm_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 @@ -1790,7 +1874,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) ; ; GFX8-LABEL: v_clamp_constant_preserve_denorm_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1802,7 +1886,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) ; ; GFX9-LABEL: v_clamp_constant_preserve_denorm_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1811,7 +1895,9 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) ; ; GFX11-LABEL: v_clamp_constant_preserve_denorm_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1821,7 +1907,9 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) ; ; GFX12-LABEL: v_clamp_constant_preserve_denorm_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1838,7 +1926,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr 
addrspace(1) %out) #0 { ; GFX6-LABEL: v_clamp_constant_qnan_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1849,7 +1937,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 { ; ; GFX8-LABEL: v_clamp_constant_qnan_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1861,7 +1949,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 { ; ; GFX9-LABEL: v_clamp_constant_qnan_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1870,8 +1958,10 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 { ; ; GFX11-LABEL: v_clamp_constant_qnan_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1880,8 +1970,10 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 { ; ; GFX12-LABEL: v_clamp_constant_qnan_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 
v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1897,7 +1989,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: v_clamp_constant_snan_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1908,7 +2000,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 { ; ; GFX8-LABEL: v_clamp_constant_snan_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1920,7 +2012,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 { ; ; GFX9-LABEL: v_clamp_constant_snan_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1929,8 +2021,10 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 { ; ; GFX11-LABEL: v_clamp_constant_snan_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1939,8 +2033,10 
@@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 { ; ; GFX12-LABEL: v_clamp_constant_snan_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1960,7 +2056,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1977,7 +2073,7 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: v_clamp_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1995,7 +2091,7 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_clamp_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2007,13 +2103,14 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_clamp_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 
0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -2022,7 +2119,9 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: v_clamp_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2047,7 +2146,7 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #3 { ; GFX6-LABEL: v_clamp_f32_snan_dx10clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2063,7 +2162,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: v_clamp_f32_snan_dx10clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2080,7 +2179,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_clamp_f32_snan_dx10clamp: ; GFX9: 
; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2091,7 +2190,9 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_clamp_f32_snan_dx10clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2104,7 +2205,9 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: v_clamp_f32_snan_dx10clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2129,7 +2232,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 { ; GFX6-LABEL: v_clamp_f32_snan_no_dx10clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2146,7 +2249,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ; ; GFX8-LABEL: v_clamp_f32_snan_no_dx10clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2164,7 +2267,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2176,13 +2279,14 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -2191,7 +2295,9 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ; ; GFX12-LABEL: v_clamp_f32_snan_no_dx10clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2215,7 +2321,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 { ; GFX6-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2232,7 +2338,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace( ; ; GFX8-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2250,7 +2356,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace( ; ; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2262,13 +2368,14 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace( ; ; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -2277,7 +2384,9 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace( ; ; GFX12-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2302,7 +2411,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace( define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2318,7 +2427,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2335,7 +2444,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2346,7 +2455,9 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2359,7 +2470,9 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr 
addrspace(1) % ; ; GFX12-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2381,7 +2494,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) % define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2397,7 +2510,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2414,7 +2527,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2425,7 +2538,9 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2438,7 +2553,9 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2460,7 +2577,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) % define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2476,7 +2593,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2493,7 +2610,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2504,7 +2621,9 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX11-LABEL: 
v_clamp_med3_yab_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2517,7 +2636,9 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2539,7 +2660,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) % define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2555,7 +2676,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2572,7 +2693,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2583,7 +2704,9 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2596,7 +2719,9 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2618,7 +2743,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) % define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2634,7 +2759,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2651,7 +2776,7 @@ define amdgpu_kernel void 
@v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2662,7 +2787,9 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2675,7 +2802,9 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2697,7 +2826,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) % define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2713,7 +2842,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2730,7 +2859,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2741,7 +2870,9 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2754,7 +2885,9 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2776,7 +2909,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) % define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace(1) %out) #2 { ; GFX6-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ 
-2788,7 +2921,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace ; ; GFX8-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2800,7 +2933,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace ; ; GFX9-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2809,7 +2942,9 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace ; ; GFX11-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7fc00000 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2819,8 +2954,10 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace ; ; GFX12-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -2836,7 +2973,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr 
addrspace define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace(1) %out) #2 { ; GFX6-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2848,7 +2985,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace ; ; GFX8-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800001 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2860,7 +2997,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace ; ; GFX9-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7f800001 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2869,7 +3006,9 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace ; ; GFX11-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7f800001 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2879,8 +3018,10 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace ; ; GFX12-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -2896,7 +3037,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2918,7 +3059,7 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: v_clamp_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2937,7 +3078,7 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_clamp_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2948,7 +3089,9 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_clamp_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2961,7 +3104,9 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr 
addrspace(1) %out, ptr addrspace(1) ; ; GFX12-LABEL: v_clamp_v2f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2985,7 +3130,7 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16_undef_elt: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3011,7 +3156,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: v_clamp_v2f16_undef_elt: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3035,7 +3180,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_clamp_v2f16_undef_elt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3046,7 +3191,9 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_clamp_v2f16_undef_elt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, 
v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3059,7 +3206,9 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad ; ; GFX12-LABEL: v_clamp_v2f16_undef_elt: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3083,7 +3232,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16_not_zero: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3107,7 +3256,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: v_clamp_v2f16_not_zero: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3128,7 +3277,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_clamp_v2f16_not_zero: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3141,14 +3290,16 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: v_clamp_v2f16_not_zero: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: 
s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, 2.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -3157,14 +3308,16 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; ; GFX12-LABEL: v_clamp_v2f16_not_zero: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, 2.0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -3184,7 +3337,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16_not_one: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 
@@ -3207,7 +3360,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: v_clamp_v2f16_not_one: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3228,7 +3381,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_clamp_v2f16_not_one: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3241,14 +3394,16 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: v_clamp_v2f16_not_one: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -3257,14 +3412,16 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; ; GFX12-LABEL: v_clamp_v2f16_not_one: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0] ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -3284,7 +3441,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_neg_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3307,7 +3464,7 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX8-LABEL: v_clamp_neg_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3326,7 +3483,7 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_clamp_neg_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3337,7 +3494,9 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX11-LABEL: v_clamp_neg_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; 
GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3350,7 +3509,9 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX12-LABEL: v_clamp_neg_v2f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3375,7 +3536,7 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_negabs_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3398,7 +3559,7 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_negabs_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3417,7 +3578,7 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_negabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3429,13 +3590,14 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: 
v_clamp_negabs_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -3444,13 +3606,14 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_negabs_v2f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -3473,7 +3636,7 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_neglo_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3496,7 +3659,7 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; 
GFX8-LABEL: v_clamp_neglo_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3515,7 +3678,7 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_clamp_neglo_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3526,7 +3689,9 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: v_clamp_neglo_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3539,7 +3704,9 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: v_clamp_neglo_v2f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3566,7 +3733,7 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_neghi_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 @@ -3588,7 +3755,7 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: v_clamp_neghi_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3607,7 +3774,7 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_clamp_neghi_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3618,7 +3785,9 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: v_clamp_neghi_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3631,7 +3800,9 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: v_clamp_neghi_v2f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3658,7 +3829,7 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16_shuffle: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3680,7 +3851,7 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: v_clamp_v2f16_shuffle: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3699,7 +3870,7 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_clamp_v2f16_shuffle: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3710,7 +3881,9 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: v_clamp_v2f16_shuffle: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3723,7 +3896,9 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr ; ; GFX12-LABEL: v_clamp_v2f16_shuffle: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3748,7 +3923,7 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void 
@v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16_undef_limit_elts0: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3774,7 +3949,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out ; ; GFX8-LABEL: v_clamp_v2f16_undef_limit_elts0: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3798,7 +3973,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out ; ; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3809,7 +3984,9 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out ; ; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3822,7 +3999,9 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out ; ; GFX12-LABEL: v_clamp_v2f16_undef_limit_elts0: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, 
v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3846,7 +4025,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16_undef_limit_elts1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3872,7 +4051,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out ; ; GFX8-LABEL: v_clamp_v2f16_undef_limit_elts1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3896,7 +4075,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out ; ; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3907,7 +4086,9 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out ; ; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3920,7 +4101,9 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out ; ; GFX12-LABEL: v_clamp_v2f16_undef_limit_elts1: ; GFX12: ; %bb.0: -; GFX12-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3944,7 +4127,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 ; GFX6-LABEL: v_clamp_diff_source_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NEXT: s_load_dword s2, s[2:3], 0x2 @@ -3961,7 +4144,7 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: v_clamp_diff_source_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -3980,7 +4163,7 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_clamp_diff_source_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 @@ -3996,7 +4179,7 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_clamp_diff_source_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -4014,7 +4197,7 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad ; ; 
GFX12-LABEL: v_clamp_diff_source_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b96 s[4:6], s[2:3], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll index b6948dab6bf9f..fad1d47f55fd7 100644 --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -30,7 +30,7 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noalias %sb) { ; GFX9-LABEL: cluster_load_cluster_store: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -49,7 +49,7 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali ; ; GFX10-LABEL: cluster_load_cluster_store: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s4, s0, 8 ; GFX10-NEXT: s_addc_u32 s5, s1, 0 @@ -96,7 +96,7 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali ; ; GFX11-LABEL: cluster_load_cluster_store: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_clause 0x3 @@ -155,7 +155,7 @@ bb: define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr noalias %sb) { ; GFX9-LABEL: cluster_load_valu_cluster_store: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -175,7 +175,7 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr ; ; GFX10-LABEL: cluster_load_valu_cluster_store: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s4, s0, 8 ; GFX10-NEXT: s_addc_u32 s5, s1, 0 @@ -223,7 +223,7 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr ; ; GFX11-LABEL: cluster_load_valu_cluster_store: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_clause 0x3 diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll b/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll index 9edf566335925..dcd088e2bd988 100644 --- a/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll @@ -9,7 +9,7 @@ ; GCN-NEXT: v_mov_b32_e32 v{{[0-9]*}}[[LO:[02468]]], v{{[0-9]+}} ; GCN-NEXT: global_store_dwordx2 v{{[0-9]+}}, v[[[LO]]:{{[0-9]+\]}}, s[{{[0-9:]+}}] -define amdgpu_kernel void @test_odd_int4(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) { +define amdgpu_kernel void @test_odd_int4(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i32 %lid @@ -24,7 +24,7 @@ bb: ; GCN: global_load_dwordx2 v[{{[0-9]*[02468]}}:{{[0-9]+}}], ; GCN-DAG: v_mov_b32_e32 v{{[0-9]*}}[[LO:[02468]]], v{{[0-9]+}} ; GCN: global_store_dwordx4 v[{{[0-9]*[02468]:[0-9]*[13579]}}], v[{{[0-9]*[02468]:[0-9]*[13579]}}] -define amdgpu_kernel void @test_vector_creation() { +define amdgpu_kernel void @test_vector_creation() #0 { entry: %tmp231 = load <4 x i16>, ptr addrspace(1) undef, align 2 %vext466 = 
shufflevector <4 x i16> %tmp231, <4 x i16> undef, <8 x i32> @@ -35,3 +35,5 @@ entry: } declare i32 @llvm.amdgcn.workitem.id.x() + +attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll index 9321bc262c4a4..3035a8579c8a6 100644 --- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll @@ -12,7 +12,7 @@ ; OSABI-AMDHSA-ASM: .section .rodata,"a" ; OSABI-AMDHSA-ASM: .p2align 6 ; OSABI-AMDHSA-ASM: .amdhsa_kernel fadd -; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 6 +; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 10 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3 @@ -31,7 +31,7 @@ ; OSABI-AMDHSA-ASM: .section .rodata,"a" ; OSABI-AMDHSA-ASM: .p2align 6 ; OSABI-AMDHSA-ASM: .amdhsa_kernel fsub -; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 6 +; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 10 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3 diff --git a/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll b/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll index aa1ad16b2a56e..9d93609b1e881 100644 --- a/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll +++ b/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll @@ -1,30 +1,9 @@ ; REQUIRES: asserts -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=OPT,COV4 %s -; RUN: not llc --crash -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s -; RUN: sed 
's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=OPT,COV5,COV56 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=OPT,COV6,COV56 %s +; RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s +; RUN: not --crash llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s -; AMDGPUAttributor deletes the function "by accident" so it's never -; codegened with optimizations. +; CHECK: function must have been generated already -; OPT: .text -; OPT-NEXT: .section ".note.GNU-stack" -; OPT-NEXT: .amdgcn_target "amdgcn-amd-amdhsa--gfx900" -; COV4-NEXT: .amdhsa_code_object_version 4 -; COV5-NEXT: .amdhsa_code_object_version 5 -; COV6-NEXT: .amdhsa_code_object_version 6 -; OPT-NEXT: .amdgpu_metadata -; OPT-NEXT: --- -; OPT-NEXT: amdhsa.kernels: [] -; OPT-NEXT: amdhsa.target: amdgcn-amd-amdhsa--gfx900 -; OPT-NEXT: amdhsa.version: -; OPT-NEXT: - 1 -; COV4: - 1 -; COV56: - 2 -; OPT: ... 
define internal i32 @func() { ret i32 0 } - -!llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdhsa_code_object_version", i32 CODE_OBJECT_VERSION} diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index 6bc8d29b3bf7c..75f5eda608e80 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -13,7 +13,7 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) { ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execz .LBB0_3 ; GCN-NEXT: ; %bb.1: ; %bb.outer.then -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -180,7 +180,7 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execz .LBB1_4 ; GCN-NEXT: ; %bb.1: ; %bb.outer.then -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -380,7 +380,7 @@ bb.outer.end: ; preds = %bb.inner.then, %bb define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: nested_if_if_else: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -614,7 +614,7 @@ bb.outer.end: ; preds = %bb, %bb.then, %b define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: nested_if_else_if: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: 
s_mov_b32 s3, 0xf000 @@ -911,10 +911,10 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a ; GCN-LABEL: s_endpgm_unsafe_barrier: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GCN-NEXT: s_cbranch_execz .LBB4_2 ; GCN-NEXT: ; %bb.1: ; %bb.then -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -922,7 +922,7 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v1, v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: .LBB4_2: ; %bb.end -; GCN-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_barrier ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll index 33c0d90f94a39..df223b3ec1354 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @add1(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: add1: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -20,7 +20,7 @@ define amdgpu_kernel void @add1(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: add1: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -88,7 +88,7 @@ bb: define amdgpu_kernel void @sub1(ptr addrspace(1) nocapture %arg) { ; 
GCN-LABEL: sub1: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -103,7 +103,7 @@ define amdgpu_kernel void @sub1(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: sub1: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -127,33 +127,33 @@ bb: define amdgpu_kernel void @add_adde(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: add_adde: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 -; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v5, s4 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v5, v4, vcc -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: add_adde: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[2:3] 
+; GFX9-NEXT: global_load_dword v3, v2, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v4, v3, vcc -; GFX9-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -171,33 +171,33 @@ bb: define amdgpu_kernel void @adde_add(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: adde_add: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 -; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v5, s4 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v5, vcc -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: adde_add: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[2:3] +; GFX9-NEXT: global_load_dword v3, v2, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v4, vcc -; GFX9-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NEXT: global_store_dword v2, 
v0, s[0:1] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -215,33 +215,33 @@ bb: define amdgpu_kernel void @sub_sube(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sub_sube: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 -; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v5, s4 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_subb_u32_e32 v0, vcc, v4, v5, vcc -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sub_sube: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[2:3] +; GFX9-NEXT: global_load_dword v3, v2, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v3, v4, vcc -; GFX9-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -259,35 +259,35 @@ bb: define amdgpu_kernel void @sub_sube_commuted(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sub_sube_commuted: ; GCN: ; %bb.0: ; 
%bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v4, vcc -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sub_sube_commuted: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[2:3] +; GFX9-NEXT: global_load_dword v3, v2, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subbrev_co_u32_e32 v0, vcc, 0, v3, vcc ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x64, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -306,33 +306,33 @@ bb: define amdgpu_kernel void @sube_sub(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sube_sub: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; 
GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 -; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v5, s4 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_subb_u32_e32 v0, vcc, v4, v5, vcc -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sube_sub: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[2:3] +; GFX9-NEXT: global_load_dword v3, v2, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v3, v4, vcc -; GFX9-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -350,33 +350,33 @@ bb: define amdgpu_kernel void @zext_flclass(ptr addrspace(1) nocapture %arg, float %x) { ; GCN-LABEL: zext_flclass: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: 
v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 ; GCN-NEXT: v_mov_b32_e32 v3, 0x260 -; GCN-NEXT: v_cmp_class_f32_e32 vcc, s0, v3 +; GCN-NEXT: v_cmp_class_f32_e32 vcc, s4, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: zext_flclass: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x260 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-NEXT: v_cmp_class_f32_e32 vcc, s4, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm bb: %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -392,33 +392,33 @@ bb: define amdgpu_kernel void @sext_flclass(ptr addrspace(1) nocapture %arg, float %x) { ; GCN-LABEL: sext_flclass: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 ; GCN-NEXT: v_mov_b32_e32 v3, 0x260 -; 
GCN-NEXT: v_cmp_class_f32_e32 vcc, s0, v3 +; GCN-NEXT: v_cmp_class_f32_e32 vcc, s4, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sext_flclass: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x260 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-NEXT: v_cmp_class_f32_e32 vcc, s4, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm bb: %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -434,7 +434,7 @@ bb: define amdgpu_kernel void @add_and(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: add_and: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -450,7 +450,7 @@ define amdgpu_kernel void @add_and(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: add_and: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_max_u32_e32 v1, 1, v1 ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 @@ -478,7 +478,7 @@ bb: define amdgpu_kernel void @cmp_sub_sext(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: cmp_sub_sext: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; 
GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -493,7 +493,7 @@ define amdgpu_kernel void @cmp_sub_sext(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: cmp_sub_sext: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -518,7 +518,7 @@ bb: define amdgpu_kernel void @cmp_sub_zext(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: cmp_sub_zext: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -533,7 +533,7 @@ define amdgpu_kernel void @cmp_sub_zext(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: cmp_sub_zext: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -557,33 +557,33 @@ bb: define amdgpu_kernel void @sub_addcarry(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sub_addcarry: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc -; GCN-NEXT: 
v_subrev_i32_e32 v0, vcc, s0, v0 -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sub_addcarry: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[2:3] +; GFX9-NEXT: global_load_dword v3, v2, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -601,33 +601,33 @@ bb: define amdgpu_kernel void @sub_subcarry(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sub_subcarry: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 -; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v5, s4 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_subb_u32_e32 v0, vcc, v4, v5, vcc -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm ; ; 
GFX9-LABEL: sub_subcarry: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[2:3] +; GFX9-NEXT: global_load_dword v3, v2, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v3, v4, vcc -; GFX9-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -646,7 +646,7 @@ bb: define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %arg, i32 %a, i32%b) { ; GCN-LABEL: sub_zext_setcc_commute: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -665,7 +665,7 @@ define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %ar ; ; GFX9-LABEL: sub_zext_setcc_commute: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -695,7 +695,7 @@ bb: define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %arg, i32 %a, i32%b) { ; GCN-LABEL: sub_sext_setcc_commute: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -714,7 +714,7 @@ define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %ar ; ; 
GFX9-LABEL: sub_sext_setcc_commute: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll index 3a7100c5903eb..5fbcd0bf66999 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll @@ -5,12 +5,12 @@ define protected amdgpu_kernel void @_Z11test_kernelPii(ptr addrspace(1) nocapture %Ad.coerce, i32 %s) local_unnamed_addr #5 { ; CHECK-LABEL: _Z11test_kernelPii: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[4:5], 0x2 +; CHECK-NEXT: s_load_dword s0, s[6:7], 0x2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s0, 3 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %if.then -; CHECK-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; CHECK-NEXT: s_and_b32 s4, s0, 0xffff ; CHECK-NEXT: s_mov_b32 s1, 0 ; CHECK-NEXT: s_mul_i32 s6, s4, 0xaaab diff --git a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll index c27e44609c527..48bd8f9b80799 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @vectorLoadCombine(ptr %in, ptr %out) { ; GCN-LABEL: vectorLoadCombine: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -37,7 +37,7 @@ entry: define amdgpu_kernel void @vectorLoadShuffle(ptr %in, ptr %out) { ; GCN-LABEL: vectorLoadShuffle: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; 
GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll index e9dbce9026ca0..9e5dbe91504a0 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone define amdgpu_kernel void @test_copy_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -26,7 +26,7 @@ define amdgpu_kernel void @test_copy_v4i8(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: test_copy_v4i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -48,41 +48,40 @@ define amdgpu_kernel void @test_copy_v4i8(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_copy_v4i8_x2(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_x2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 
0x9 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_x2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s0 @@ -104,7 +103,7 @@ define amdgpu_kernel void @test_copy_v4i8_x2(ptr addrspace(1) %out0, ptr addrspa define amdgpu_kernel void @test_copy_v4i8_x3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_x3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -129,7 +128,7 @@ define amdgpu_kernel void @test_copy_v4i8_x3(ptr addrspace(1) %out0, ptr addrspa ; ; VI-LABEL: test_copy_v4i8_x3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 @@ -163,51 +162,51 @@ define amdgpu_kernel void @test_copy_v4i8_x3(ptr addrspace(1) %out0, ptr addrspa define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_x4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x11 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x11 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: s_mov_b32 s18, s2 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_mov_b32 s18, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s19, s3 -; SI-NEXT: s_mov_b32 s22, s2 -; SI-NEXT: s_mov_b32 s23, s3 -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s16, s8 -; SI-NEXT: s_mov_b32 s17, s9 -; SI-NEXT: s_mov_b32 s20, s10 -; SI-NEXT: s_mov_b32 s21, s11 +; SI-NEXT: s_mov_b32 s4, s8 +; SI-NEXT: s_mov_b32 s5, s9 +; SI-NEXT: s_mov_b32 s19, s7 +; SI-NEXT: s_mov_b32 s22, s6 +; SI-NEXT: s_mov_b32 s23, s7 +; SI-NEXT: s_mov_b32 s0, s10 +; SI-NEXT: s_mov_b32 s1, s11 +; SI-NEXT: s_mov_b32 s16, s12 +; SI-NEXT: s_mov_b32 s17, s13 +; SI-NEXT: s_mov_b32 s20, s14 +; SI-NEXT: s_mov_b32 s21, s15 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword 
v0, off, s[4:7], 0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; SI-NEXT: buffer_store_dword v0, off, s[20:23], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_x4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_mov_b32 s18, s10 ; VI-NEXT: s_mov_b32 s19, s11 @@ -241,23 +240,22 @@ define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspa define amdgpu_kernel void @test_copy_v4i8_extra_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_extra_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 ; 
SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0 @@ -273,23 +271,23 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(ptr addrspace(1) %out0, ptr ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: buffer_store_dword v1, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_extra_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s0 @@ -326,7 +324,7 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(ptr addrspace(1) %out0, ptr define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_x2_extra_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 
@@ -365,7 +363,7 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, p ; ; VI-LABEL: test_copy_v4i8_x2_extra_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 @@ -413,7 +411,7 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, p define amdgpu_kernel void @test_copy_v3i8_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v3i8_align4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -433,7 +431,7 @@ define amdgpu_kernel void @test_copy_v3i8_align4(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: test_copy_v3i8_align4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -457,7 +455,7 @@ define amdgpu_kernel void @test_copy_v3i8_align4(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @test_copy_v3i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v3i8_align2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -477,7 +475,7 @@ define amdgpu_kernel void @test_copy_v3i8_align2(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: test_copy_v3i8_align2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -502,7 +500,7 @@ define amdgpu_kernel void @test_copy_v3i8_align2(ptr 
addrspace(1) %out, ptr addr define amdgpu_kernel void @test_copy_v3i8_align1(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v3i8_align1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -525,7 +523,7 @@ define amdgpu_kernel void @test_copy_v3i8_align1(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: test_copy_v3i8_align1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -552,7 +550,7 @@ define amdgpu_kernel void @test_copy_v3i8_align1(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @test_copy_v4i8_volatile_load(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_volatile_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -569,7 +567,7 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_load(ptr addrspace(1) %out, p ; ; VI-LABEL: test_copy_v4i8_volatile_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -591,7 +589,7 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_load(ptr addrspace(1) %out, p define amdgpu_kernel void @test_copy_v4i8_volatile_store(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_volatile_store: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -618,7 +616,7 @@ define amdgpu_kernel 
void @test_copy_v4i8_volatile_store(ptr addrspace(1) %out, ; ; VI-LABEL: test_copy_v4i8_volatile_store: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll index a0e76f9a47a8a..95d28c9749522 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll @@ -6,44 +6,44 @@ define protected amdgpu_kernel void @sccClobber(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %e, ptr addrspace(1) %f, ptr addrspace(1) %pout.coerce) { ; RRLIST-LABEL: sccClobber: ; RRLIST: ; %bb.0: ; %entry -; RRLIST-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; RRLIST-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; RRLIST-NEXT: v_mov_b32_e32 v2, 0 ; RRLIST-NEXT: s_waitcnt lgkmcnt(0) ; RRLIST-NEXT: s_load_dword s16, s[8:9], 0x0 -; RRLIST-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; RRLIST-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; RRLIST-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 -; RRLIST-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x44 +; RRLIST-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x44 ; RRLIST-NEXT: s_load_dword s17, s[10:11], 0x0 ; RRLIST-NEXT: s_waitcnt lgkmcnt(0) ; RRLIST-NEXT: s_min_i32 s4, s16, 0 -; RRLIST-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; RRLIST-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; RRLIST-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] -; RRLIST-NEXT: s_and_b64 s[0:1], vcc, exec -; RRLIST-NEXT: s_cselect_b32 s0, s16, s17 -; RRLIST-NEXT: s_cmp_eq_u64 s[12:13], s[2:3] -; RRLIST-NEXT: s_cselect_b32 s0, s4, s0 +; RRLIST-NEXT: s_and_b64 s[2:3], vcc, exec +; RRLIST-NEXT: s_cselect_b32 s2, s16, s17 +; RRLIST-NEXT: s_cmp_eq_u64 s[12:13], s[0:1] +; RRLIST-NEXT: s_cselect_b32 s0, s4, s2 ; RRLIST-NEXT: v_mov_b32_e32 v0, s0 ; 
RRLIST-NEXT: global_store_dword v2, v0, s[14:15] ; RRLIST-NEXT: s_endpgm ; ; FAST-LABEL: sccClobber: ; FAST: ; %bb.0: ; %entry -; FAST-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; FAST-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; FAST-NEXT: v_mov_b32_e32 v2, 0 ; FAST-NEXT: s_waitcnt lgkmcnt(0) ; FAST-NEXT: s_load_dword s16, s[8:9], 0x0 -; FAST-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; FAST-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; FAST-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 -; FAST-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x44 +; FAST-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x44 ; FAST-NEXT: s_load_dword s17, s[10:11], 0x0 ; FAST-NEXT: s_waitcnt lgkmcnt(0) ; FAST-NEXT: s_min_i32 s4, s16, 0 -; FAST-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; FAST-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; FAST-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] -; FAST-NEXT: s_and_b64 s[0:1], vcc, exec -; FAST-NEXT: s_cselect_b32 s0, s16, s17 -; FAST-NEXT: s_cmp_eq_u64 s[12:13], s[2:3] -; FAST-NEXT: s_cselect_b32 s0, s4, s0 +; FAST-NEXT: s_and_b64 s[2:3], vcc, exec +; FAST-NEXT: s_cselect_b32 s2, s16, s17 +; FAST-NEXT: s_cmp_eq_u64 s[12:13], s[0:1] +; FAST-NEXT: s_cselect_b32 s0, s4, s2 ; FAST-NEXT: v_mov_b32_e32 v0, s0 ; FAST-NEXT: global_store_dword v2, v0, s[14:15] ; FAST-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll index 7dd95a02f136b..c57ee9cc6a1e2 100644 --- a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll +++ b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll @@ -4,12 +4,12 @@ define amdgpu_kernel void @copy_to_scc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(4) %addrSrc) { ; GCN-LABEL: copy_to_scc: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 ; 
GCN-NEXT: s_nop 0 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:252 ; GCN-NEXT: s_load_dword s2, s[2:3], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index 4decf39d04013..63b9d68123fa4 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -23,11 +23,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_ctlz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s2, s2 +; SI-NEXT: s_flbit_i32_b32 s2, s4 ; SI-NEXT: s_min_u32 s4, s2, 32 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -36,8 +36,8 @@ define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; ; VI-LABEL: s_ctlz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -62,36 +62,36 @@ define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; GFX10-LABEL: s_ctlz_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_flbit_i32_b32 s0, s4 -; GFX10-NEXT: s_min_u32 s0, s0, 32 -; GFX10-NEXT: 
v_mov_b32_e32 v1, s0 -; GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-NEXT: s_flbit_i32_b32 s2, s4 +; GFX10-NEXT: s_min_u32 s2, s2, 32 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_ctlz_i32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_flbit_i32_b32 s0, s4 -; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 32 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-GISEL-NEXT: s_flbit_i32_b32 s2, s4 +; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 32 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: s_ctlz_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clz_i32_u32 s2, s2 +; GFX11-NEXT: s_clz_i32_u32 s2, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_min_u32 s2, s2, 32 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -107,7 +107,7 @@ define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 @@ -127,7 +127,7 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; VI-LABEL: v_ctlz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -164,7 +164,7 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_ctlz_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -177,7 +177,7 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-GISEL-LABEL: v_ctlz_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -190,13 +190,14 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-LABEL: v_ctlz_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -213,7 +214,7 @@ define amdgpu_kernel void 
@v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -235,7 +236,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_ctlz_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -277,7 +278,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_ctlz_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -292,7 +293,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-GISEL-LABEL: v_ctlz_v2i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -307,9 +308,11 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-LABEL: v_ctlz_v2i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -333,7 +336,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -359,7 +362,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_ctlz_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -411,7 +414,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_ctlz_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -430,7 +433,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-GISEL-LABEL: v_ctlz_v4i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -449,9 +452,11 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-LABEL: v_ctlz_v4i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; 
GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -480,7 +485,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -500,7 +505,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; VI-LABEL: v_ctlz_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -550,7 +555,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX10-LABEL: v_ctlz_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -563,7 +568,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX10-GISEL-LABEL: v_ctlz_i8: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -576,7 +581,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX11-LABEL: v_ctlz_i8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 
s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] @@ -598,8 +603,8 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { ; SI-LABEL: s_ctlz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -612,8 +617,8 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; ; VI-LABEL: s_ctlz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 @@ -645,11 +650,11 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-LABEL: s_ctlz_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_flbit_i32_b64 s0, s[2:3] +; GFX10-NEXT: s_flbit_i32_b64 s0, s[0:1] ; GFX10-NEXT: s_min_u32 s0, s0, 64 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] @@ -658,12 +663,12 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-GISEL-LABEL: s_ctlz_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; 
GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3] +; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[0:1] +; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -673,14 +678,14 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX11-LABEL: s_ctlz_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clz_i32_u64 s2, s[2:3] +; GFX11-NEXT: s_clz_i32_u64 s0, s[0:1] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_min_u32 s2, s2, 64 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-NEXT: global_store_b64 v1, v[0:1], s[0:1] +; GFX11-NEXT: s_min_u32 s0, s0, 64 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: global_store_b64 v1, v[0:1], s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -692,7 +697,7 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_ctlz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_flbit_i32_b64 s2, s[2:3] @@ -706,7 +711,7 @@ define amdgpu_kernel void 
@s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; VI-LABEL: s_ctlz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -737,7 +742,7 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX10-LABEL: s_ctlz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_flbit_i32_b64 s2, s[2:3] @@ -748,7 +753,7 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX10-GISEL-LABEL: s_ctlz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_flbit_i32_b64 s2, s[2:3] @@ -759,7 +764,7 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX11-LABEL: s_ctlz_i64_trunc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clz_i32_u64 s2, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -778,7 +783,7 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctlz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -799,7 +804,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) 
noalias %out, ptr addrspa ; ; VI-LABEL: v_ctlz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -847,7 +852,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_ctlz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -862,7 +867,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-GISEL-LABEL: v_ctlz_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -878,7 +883,9 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-LABEL: v_ctlz_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -905,7 +912,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctlz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -926,7 +933,7 
@@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; VI-LABEL: v_ctlz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -974,7 +981,7 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX10-LABEL: v_ctlz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -989,7 +996,7 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX10-GISEL-LABEL: v_ctlz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1005,18 +1012,20 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX11-LABEL: v_ctlz_i64_trunc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 ; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 -; GFX11-NEXT: v_clz_i32_u32_e32 v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp -; GFX11-NEXT: v_min3_u32 v1, v1, v2, 64 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp +; GFX11-NEXT: v_min3_u32 v0, v0, v1, 64 +; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1033,7 +1042,7 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1052,7 +1061,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1090,7 +1099,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1102,7 +1111,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: 
global_load_dword v0, v0, s[2:3] @@ -1117,8 +1126,10 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX11-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1140,7 +1151,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1159,7 +1170,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1197,7 +1208,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1209,7 +1220,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX10-GISEL: 
; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -1224,8 +1235,10 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX11-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1248,7 +1261,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1270,7 +1283,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1313,7 +1326,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1328,7 +1341,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1343,14 +1356,16 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX11-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1370,7 +1385,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ 
-1392,7 +1407,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1435,7 +1450,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1450,7 +1465,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1465,14 +1480,16 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX11-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1492,7 +1509,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1510,7 +1527,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_ctlz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1552,7 +1569,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i8_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] @@ -1563,7 +1580,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_ctlz_i8_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 @@ -1583,8 +1600,8 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX11-LABEL: v_ctlz_i8_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; 
GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1606,7 +1623,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i16_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1624,7 +1641,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_ctlz_i16_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1674,7 +1691,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i16_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] @@ -1689,7 +1706,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_ctlz_i16_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] @@ -1705,7 +1722,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX11-LABEL: 
v_ctlz_i16_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -1733,7 +1750,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i7_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1752,7 +1769,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_i7_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1795,7 +1812,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_ctlz_i7_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] @@ -1807,7 +1824,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-GISEL-LABEL: v_ctlz_i7_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 @@ -1829,13 +1846,14 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX11-LABEL: 
v_ctlz_i7_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x7f, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x7f, v0 ; GFX11-NEXT: global_store_b8 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index d269eb680138b..f16f05811c185 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -29,11 +29,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s4, s2 +; SI-NEXT: s_flbit_i32_b32 s4, s4 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -41,10 +41,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; VI-LABEL: s_ctlz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b32 s2, s2 +; VI-NEXT: s_flbit_i32_b32 s2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: 
v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -64,13 +64,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone store i32 %ctlz, ptr addrspace(1) %out, align 4 @@ -80,7 +80,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -99,7 +99,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -134,7 +134,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -154,7 +154,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -174,7 +174,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_ctlz_zero_undef_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -211,7 +211,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -232,7 +232,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -254,7 +254,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: 
v_ctlz_zero_undef_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -295,7 +295,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -318,11 +318,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, i8 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i8_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s2, s2, 24 +; SI-NEXT: s_lshl_b32 s2, s4, 24 ; SI-NEXT: s_flbit_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -331,10 +331,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; VI-LABEL: s_ctlz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s2, s2, 24 +; VI-NEXT: s_lshl_b32 s2, s4, 24 ; VI-NEXT: s_flbit_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -373,14 +373,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: 
s_ctlz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_lshl_b32 s0, s4, 24 -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: s_lshl_b32 s2, s4, 24 +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = tail call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone %ctlz_ret = icmp ne i8 %val, 0 @@ -392,11 +392,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, i16 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i16_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s2, s2, 16 +; SI-NEXT: s_lshl_b32 s2, s4, 16 ; SI-NEXT: s_flbit_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -405,10 +405,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_ctlz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s2, s2, 16 +; VI-NEXT: s_lshl_b32 s2, s4, 16 ; VI-NEXT: s_flbit_i32_b32 s2, s2 ; 
VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -447,14 +447,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_lshl_b32 s0, s4, 16 -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: global_store_short v1, v0, s[2:3] +; GFX9-GISEL-NEXT: s_lshl_b32 s2, s4, 16 +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-GISEL-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = tail call i16 @llvm.ctlz.i16(i16 %val, i1 true) nounwind readnone %ctlz_ret = icmp ne i16 %val, 0 @@ -466,11 +466,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i32_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s4, s2 +; SI-NEXT: s_flbit_i32_b32 s4, s4 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -478,10 +478,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_ctlz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: 
s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b32 s2, s2 +; VI-NEXT: s_flbit_i32_b32 s2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -501,13 +501,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = tail call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %ctlz_ret = icmp ne i32 %val, 0 @@ -519,7 +519,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -533,7 +533,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_ctlz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_flbit_i32_b64 s2, s[2:3] @@ -561,7 
+561,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -580,7 +580,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i8_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -601,7 +601,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; VI-LABEL: v_ctlz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -649,7 +649,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -672,7 +672,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i16_with_select: ; SI: ; %bb.0: -; SI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -697,7 +697,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -753,7 +753,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -778,7 +778,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -809,7 +809,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -869,7 +869,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; 
GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -899,7 +899,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -946,7 +946,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 5 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1050,7 +1050,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[2:3] @@ -1094,7 +1094,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1113,7 +1113,7 
@@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: v_ctlz_zero_undef_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1158,7 +1158,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 @@ -1183,8 +1183,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1196,14 +1196,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; VI-LABEL: s_ctlz_zero_undef_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b64 s2, s[2:3] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_flbit_i32_b64 s0, s[0:1] +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: 
flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1225,14 +1225,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s1, 0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_mov_b32 s3, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-GISEL-NEXT: s_flbit_i32_b64 s2, s[0:1] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true) @@ -1243,7 +1243,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_flbit_i32_b64 s2, s[2:3] @@ -1256,7 +1256,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; VI-LABEL: s_ctlz_zero_undef_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_flbit_i32_b64 s2, s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1282,7 +1282,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_trunc: ; GFX9-GISEL: ; %bb.0: -; 
GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_flbit_i32_b64 s2, s[2:3] @@ -1298,7 +1298,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -1318,7 +1318,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_zero_undef_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1364,7 +1364,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -1388,7 +1388,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -1408,7 +1408,7 @@ define amdgpu_kernel 
void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; VI-LABEL: v_ctlz_zero_undef_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1454,7 +1454,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_trunc: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1479,7 +1479,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1498,7 +1498,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1534,7 +1534,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -1558,7 +1558,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1577,7 +1577,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1613,7 +1613,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -1637,7 +1637,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1655,7 +1655,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; ; VI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: ; VI: ; %bb.0: -; 
VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1697,7 +1697,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 @@ -1726,7 +1726,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1750,7 +1750,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1799,7 +1799,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1829,7 +1829,7 @@ define amdgpu_kernel void 
@v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1850,7 +1850,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1888,7 +1888,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -1913,7 +1913,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1934,7 +1934,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1972,7 +1972,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -1997,7 +1997,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2018,7 +2018,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1 ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2057,7 +2057,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1 ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -2082,7 +2082,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1 define amdgpu_kernel void 
@v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2103,7 +2103,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1 ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2142,7 +2142,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1 ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -2196,11 +2196,11 @@ define i7 @v_ctlz_zero_undef_i7(i7 %val) { define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, i18 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i18: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s2, s2, 14 +; SI-NEXT: s_lshl_b32 s2, s4, 14 ; SI-NEXT: s_flbit_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -2213,10 +2213,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, ; ; VI-LABEL: s_ctlz_zero_undef_i18: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s2, s2, 14 +; VI-NEXT: s_lshl_b32 s2, s4, 14 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_flbit_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -2270,18 +2270,18 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i18: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_lshl_b32 s0, s4, 14 -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0 -; GFX9-GISEL-NEXT: s_and_b32 s0, s0, 0x3ffff -; GFX9-GISEL-NEXT: s_lshr_b32 s1, s0, 16 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-GISEL-NEXT: global_store_short v0, v1, s[2:3] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[2:3] offset:2 +; GFX9-GISEL-NEXT: s_lshl_b32 s2, s4, 14 +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2 +; GFX9-GISEL-NEXT: s_and_b32 s2, s2, 0x3ffff +; GFX9-GISEL-NEXT: s_lshr_b32 s3, s2, 16 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] offset:2 ; GFX9-GISEL-NEXT: s_endpgm %ctlz = call i18 @llvm.ctlz.i18(i18 %val, i1 true) nounwind readnone store i18 %ctlz, ptr addrspace(1) %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll index b6359f1816979..40929d5883447 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -14,8 +14,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel 
void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val) nounwind { ; SI-LABEL: s_ctpop_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -27,8 +27,8 @@ define amdgpu_kernel void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val) ; ; VI-LABEL: s_ctpop_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -72,7 +72,7 @@ define amdgpu_kernel void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val) define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -91,7 +91,7 @@ define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrsp ; ; VI-LABEL: v_ctpop_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -142,8 +142,8 @@ define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrsp define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in0, ptr addrspace(1) noalias %in1) nounwind { ; SI-LABEL: v_ctpop_add_chain_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -166,8 +166,8 @@ define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctpop_add_chain_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -239,8 +239,8 @@ define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctpop_add_sgpr_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %sval) nounwind { ; SI-LABEL: v_ctpop_add_sgpr_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s12, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s12, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -259,8 +259,8 @@ define amdgpu_kernel void @v_ctpop_add_sgpr_i16(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: v_ctpop_add_sgpr_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -320,7 +320,7 @@ define amdgpu_kernel void @v_ctpop_add_sgpr_i16(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: 
s_mov_b32 s11, s7 @@ -344,7 +344,7 @@ define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -400,7 +400,7 @@ define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v4i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -430,7 +430,7 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -520,7 +520,7 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v8i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -562,7 +562,7 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v8i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -700,7 +700,7 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr 
addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v16i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -769,7 +769,7 @@ define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr add ; ; VI-LABEL: v_ctpop_v16i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1016,7 +1016,7 @@ define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr add define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16_add_inline_constant: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1035,7 +1035,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noal ; ; VI-LABEL: v_ctpop_i16_add_inline_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1088,7 +1088,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noal define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16_add_inline_constant_inv: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; 
SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1107,7 +1107,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1) ; ; VI-LABEL: v_ctpop_i16_add_inline_constant_inv: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1160,7 +1160,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1) define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16_add_literal: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1180,7 +1180,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_ctpop_i16_add_literal: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_movk_i32 s4, 0x3e7 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1234,8 +1234,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out define amdgpu_kernel void @v_ctpop_i16_add_var(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %const) nounwind { ; SI-LABEL: v_ctpop_i16_add_var: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s12, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s12, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -1254,8 +1254,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_var(ptr addrspace(1) noalias %out, pt ; ; VI-LABEL: v_ctpop_i16_add_var: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: 
s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -1315,8 +1315,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_var(ptr addrspace(1) noalias %out, pt define amdgpu_kernel void @v_ctpop_i16_add_var_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %const) nounwind { ; SI-LABEL: v_ctpop_i16_add_var_inv: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s12, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s12, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -1335,8 +1335,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_var_inv(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_ctpop_i16_add_var_inv: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -1396,8 +1396,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_var_inv(ptr addrspace(1) noalias %out define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %constptr) nounwind { ; SI-LABEL: v_ctpop_i16_add_vvar_inv: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -1418,8 +1418,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %ou ; ; VI-LABEL: v_ctpop_i16_add_vvar_inv: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -1487,8 +1487,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %ou define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i16 %ctpop_arg, i16 %cond) { ; SI-LABEL: ctpop_i16_in_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: s_cmp_lg_u32 s5, 0 @@ -1517,8 +1517,8 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: ctpop_i16_in_br: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s5, s4, 16 ; VI-NEXT: s_cmp_lg_u32 s5, 0 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll index 131ce14a7847c..1c16612bed37f 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll @@ -16,8 +16,8 @@ declare i128 @llvm.ctpop.i128(i128) nounwind readnone define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { ; SI-LABEL: s_ctpop_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -28,8 +28,8 @@ define amdgpu_kernel void 
@s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32], ; ; VI-LABEL: s_ctpop_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -46,7 +46,7 @@ define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32], define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -66,7 +66,7 @@ define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrsp ; ; VI-LABEL: v_ctpop_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -92,8 +92,8 @@ define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrsp define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i64 %s.val) nounwind { ; SI-LABEL: v_ctpop_i64_user: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -115,8 +115,8 @@ define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr a ; ; VI-LABEL: v_ctpop_i64_user: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 
+; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -144,8 +144,8 @@ define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr a define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64> %val) nounwind { ; SI-LABEL: s_ctpop_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -158,8 +158,8 @@ define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64 ; ; VI-LABEL: s_ctpop_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -178,38 +178,38 @@ define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64 define amdgpu_kernel void @s_ctpop_v4i64(ptr addrspace(1) noalias %out, <4 x i64> %val) nounwind { ; SI-LABEL: s_ctpop_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 +; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; SI-NEXT: s_bcnt1_i32_b64 s5, s[6:7] -; SI-NEXT: s_bcnt1_i32_b64 s6, s[8:9] -; SI-NEXT: s_bcnt1_i32_b64 s7, s[10:11] -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_bcnt1_i32_b64 s0, s[4:5] +; SI-NEXT: s_bcnt1_i32_b64 s1, s[6:7] +; SI-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; SI-NEXT: s_bcnt1_i32_b64 s3, s[10:11] +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_ctpop_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s15, 0xf000 +; VI-NEXT: s_mov_b32 s14, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; VI-NEXT: s_bcnt1_i32_b64 s5, s[6:7] -; VI-NEXT: s_bcnt1_i32_b64 s6, s[8:9] -; VI-NEXT: s_bcnt1_i32_b64 s7, s[10:11] -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_bcnt1_i32_b64 s0, s[4:5] +; VI-NEXT: s_bcnt1_i32_b64 s1, s[6:7] +; VI-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; VI-NEXT: s_bcnt1_i32_b64 s3, s[10:11] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; VI-NEXT: s_endpgm %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone %truncctpop = trunc <4 x i64> %ctpop to <4 x i32> @@ -220,7 +220,7 @@ define amdgpu_kernel void @s_ctpop_v4i64(ptr addrspace(1) noalias %out, <4 x i64 define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 
; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -242,7 +242,7 @@ define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -270,7 +270,7 @@ define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -298,7 +298,7 @@ define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -334,11 +334,11 @@ define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %ctpop_arg, i32 %cond) { ; SI-LABEL: ctpop_i64_in_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s8, s[0:1], 0xf -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; SI-NEXT: s_load_dword s0, s[2:3], 0xf +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s8, 0 +; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cbranch_scc0 .LBB7_4 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 @@ -363,11 +363,11 @@ define 
amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: ctpop_i64_in_br: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s8, s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dword s0, s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s8, 0 +; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cbranch_scc0 .LBB7_4 ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 @@ -409,8 +409,8 @@ endif: define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val) nounwind { ; SI-LABEL: s_ctpop_i128: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -423,8 +423,8 @@ define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val ; ; VI-LABEL: s_ctpop_i128: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -443,8 +443,8 @@ define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val) nounwind { ; SI-LABEL: s_ctpop_i65: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -460,8 +460,8 @@ 
define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val) ; ; VI-LABEL: s_ctpop_i65: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -484,7 +484,7 @@ define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val) define amdgpu_kernel void @v_ctpop_i128(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i128: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -507,7 +507,7 @@ define amdgpu_kernel void @v_ctpop_i128(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_ctpop_i128: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll index ee2894a66fbfc..02b0b1cc28fa8 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz.ll @@ -22,11 +22,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_cttz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s2, s2 +; SI-NEXT: s_ff1_i32_b32 s2, s4 ; SI-NEXT: s_min_u32 s4, s2, 32 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -35,8 +35,8 @@ 
define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; ; VI-LABEL: s_cttz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -61,27 +61,27 @@ define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; GFX10-LABEL: s_cttz_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ff1_i32_b32 s0, s4 -; GFX10-NEXT: s_min_u32 s0, s0, 32 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-NEXT: s_ff1_i32_b32 s2, s4 +; GFX10-NEXT: s_min_u32 s2, s2, 32 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_cttz_i32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_ff1_i32_b32 s0, s4 -; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 32 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-GISEL-NEXT: s_ff1_i32_b32 s2, s4 +; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 32 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) 
nounwind readnone store i32 %cttz, ptr addrspace(1) %out, align 4 @@ -91,7 +91,7 @@ define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) n define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -111,7 +111,7 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; VI-LABEL: v_cttz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -148,7 +148,7 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_cttz_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -161,7 +161,7 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-GISEL-LABEL: v_cttz_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -182,7 +182,7 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 
; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -204,7 +204,7 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_cttz_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -246,7 +246,7 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_cttz_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -261,7 +261,7 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-GISEL-LABEL: v_cttz_v2i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -284,7 +284,7 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -310,7 +310,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_cttz_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -362,7 +362,7 @@ define 
amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_cttz_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -381,7 +381,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-GISEL-LABEL: v_cttz_v4i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -408,7 +408,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -427,7 +427,7 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; VI-LABEL: v_cttz_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -475,7 +475,7 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX10-LABEL: v_cttz_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -487,7 +487,7 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX10-GISEL-LABEL: 
v_cttz_i8: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -505,8 +505,8 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { ; SI-LABEL: s_cttz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -519,8 +519,8 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; ; VI-LABEL: s_cttz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 @@ -552,11 +552,11 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-LABEL: s_cttz_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ff1_i32_b64 s0, s[2:3] +; GFX10-NEXT: s_ff1_i32_b64 s0, s[0:1] ; GFX10-NEXT: s_min_u32 s0, s0, 64 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] @@ -565,12 +565,12 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; 
GFX10-GISEL-LABEL: s_cttz_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[2:3] +; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[0:1] +; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -584,7 +584,7 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_cttz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ff1_i32_b64 s2, s[2:3] @@ -598,7 +598,7 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; VI-LABEL: s_cttz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -629,7 +629,7 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX10-LABEL: s_cttz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ff1_i32_b64 s2, s[2:3] @@ -640,7 +640,7 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX10-GISEL-LABEL: s_cttz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; 
GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_ff1_i32_b64 s2, s[2:3] @@ -657,7 +657,7 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_cttz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -678,7 +678,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; VI-LABEL: v_cttz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -726,7 +726,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_cttz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -741,7 +741,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-GISEL-LABEL: v_cttz_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -766,7 +766,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, 
ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_cttz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -787,7 +787,7 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; VI-LABEL: v_cttz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -835,7 +835,7 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX10-LABEL: v_cttz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -850,7 +850,7 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX10-GISEL-LABEL: v_cttz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -876,7 +876,7 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -895,7 +895,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; 
VI-LABEL: v_cttz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -933,7 +933,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -945,7 +945,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -970,7 +970,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -989,7 +989,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1027,7 +1027,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: 
v_cttz_i32_sel_ne_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1039,7 +1039,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -1065,7 +1065,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_eq_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1087,7 +1087,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i32_sel_eq_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1130,7 +1130,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i32_sel_eq_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1145,7 +1145,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) 
noalias % ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1170,7 +1170,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1192,7 +1192,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1235,7 +1235,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1250,7 +1250,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1275,7 +1275,7 @@ define 
amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1293,7 +1293,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1335,7 +1335,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] @@ -1346,7 +1346,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 @@ -1375,7 +1375,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i16_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; 
SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1393,7 +1393,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i16_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1442,7 +1442,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] @@ -1456,7 +1456,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] @@ -1480,7 +1480,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i7_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1499,7 +1499,7 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i7_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: 
v_add_u32_e32 v0, vcc, s2, v0 @@ -1542,7 +1542,7 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_cttz_i7_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] @@ -1554,7 +1554,7 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-GISEL-LABEL: v_cttz_i7_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 392a44318b0a5..2491abe4bc1ce 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -16,11 +16,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s4, s2 +; SI-NEXT: s_ff1_i32_b32 s4, s4 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -28,10 +28,10 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; VI-LABEL: s_cttz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c 
+; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s2, s2 +; VI-NEXT: s_ff1_i32_b32 s2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -51,13 +51,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone store i32 %cttz, ptr addrspace(1) %out, align 4 @@ -67,7 +67,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_zero_undef_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -86,7 +86,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -121,7 +121,7 @@ define amdgpu_kernel void 
@v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -141,7 +141,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_zero_undef_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -161,7 +161,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_cttz_zero_undef_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -198,7 +198,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v2i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -219,7 +219,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_zero_undef_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 
0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -241,7 +241,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_cttz_zero_undef_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -282,7 +282,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v4i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -305,11 +305,11 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, i8 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i8_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s4, s2 +; SI-NEXT: s_ff1_i32_b32 s4, s4 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -317,10 +317,10 @@ define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; VI-LABEL: s_cttz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s2, s2 +; VI-NEXT: s_ff1_i32_b32 s2, s4 ; VI-NEXT: v_mov_b32_e32 v0, 
s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -356,13 +356,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone %cttz_ret = icmp ne i8 %val, 0 @@ -374,11 +374,11 @@ define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, i16 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i16_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s4, s2 +; SI-NEXT: s_ff1_i32_b32 s4, s4 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -386,10 +386,10 @@ define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_cttz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s2, s2 +; VI-NEXT: s_ff1_i32_b32 s2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -425,13 +425,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: global_store_short v1, v0, s[2:3] +; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-GISEL-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone %cttz_ret = icmp ne i16 %val, 0 @@ -443,11 +443,11 @@ define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) no define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i32_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s4, s2 +; SI-NEXT: s_ff1_i32_b32 s4, s4 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -455,10 +455,10 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_cttz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s2, s2 +; VI-NEXT: s_ff1_i32_b32 s2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -478,13 +478,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone %cttz_ret = icmp ne i32 %val, 0 @@ -496,7 +496,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) no define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -510,7 +510,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_cttz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ff1_i32_b64 s2, s[2:3] 
@@ -538,7 +538,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -557,7 +557,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i8_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -577,7 +577,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; VI-LABEL: v_cttz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -622,7 +622,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -644,7 +644,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i16_with_select: ; SI: ; %bb.0: -; 
SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -668,7 +668,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_cttz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -721,7 +721,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -745,7 +745,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i32_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -776,7 +776,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_cttz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -836,7 +836,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; 
GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -866,7 +866,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -913,7 +913,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_cttz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 5 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1017,7 +1017,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[2:3] @@ -1061,7 +1061,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ 
-1091,7 +1091,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1152,7 +1152,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -1183,7 +1183,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1213,7 +1213,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1274,7 +1274,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ 
-1305,7 +1305,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1338,7 +1338,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1404,7 +1404,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -1435,7 +1435,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1453,7 +1453,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1498,7 +1498,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX9-GISEL-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1522,7 +1522,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i16_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1544,7 +1544,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i16_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1597,7 +1597,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX9-GISEL-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 3f513e120e141..96969a12b2c58 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -900,7 +900,7 @@ define double 
@v_uitofp_i8_to_f64(i8 %arg0) nounwind { define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_i8_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -918,7 +918,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; ; VI-LABEL: load_i8_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -933,7 +933,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; ; GFX10-LABEL: load_i8_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] @@ -955,8 +955,8 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; ; GFX11-LABEL: load_i8_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -976,7 +976,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v2i8_to_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: 
s_mov_b32 s11, s7 @@ -996,7 +996,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v2i8_to_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1013,7 +1013,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v2i8_to_v2f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1039,9 +1039,11 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr ; ; GFX11-LABEL: load_v2i8_to_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1062,7 +1064,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v3i8_to_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1084,7 +1086,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v3i8_to_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1102,7 +1104,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v3i8_to_v3f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1130,8 +1132,10 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr ; ; GFX11-LABEL: load_v3i8_to_v3f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1153,7 +1157,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1175,7 +1179,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v4i8_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1194,7 +1198,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr ; ; 
GFX10-LABEL: load_v4i8_to_v4f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1224,9 +1228,11 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr ; ; GFX11-LABEL: load_v4i8_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1253,7 +1259,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_unaligned: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1281,7 +1287,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1312,7 +1318,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: 
v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1355,8 +1361,10 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3 @@ -1388,7 +1396,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out1, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %in1) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 @@ -1426,7 +1434,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s8, 0x4000405 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1473,7 +1481,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ 
-1521,9 +1529,11 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_load_u8 v1, v0, s[4:5] offset:2 @@ -1563,21 +1573,21 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out2, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_2_uses: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; SI-NEXT: v_and_b32_e32 v6, 0xff00, v4 @@ -1586,7 +1596,7 @@ define amdgpu_kernel void 
@load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 ; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5 @@ -1599,22 +1609,22 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v4i8_to_v4f32_2_uses: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v5, 0xffffff00 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v6, 9 ; VI-NEXT: v_mov_b32_e32 v7, 0x900 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1643,12 +1653,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; ; GFX10-LABEL: load_v4i8_to_v4f32_2_uses: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[0:1] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v0 @@ -1704,11 +1714,13 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: load_v4i8_to_v4f32_2_uses: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_add_nc_u16 v2, v0, 9 @@ -1755,7 +1767,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v7i8_to_v7f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1794,7 +1806,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v7i8_to_v7f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1839,7 +1851,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v7i8_to_v7f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 
0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1898,9 +1910,11 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; ; GFX11-LABEL: load_v7i8_to_v7f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x5 ; GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:6 @@ -1939,7 +1953,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v8i8_to_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1966,7 +1980,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v8i8_to_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1990,7 +2004,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v8i8_to_v8f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2030,9 +2044,11 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; ; GFX11-LABEL: 
load_v8i8_to_v8f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v10, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[8:9], v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2061,7 +2077,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2081,7 +2097,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2098,7 +2114,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; ; GFX10-LABEL: i8_zext_inreg_i32_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2124,13 +2140,14 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; ; GFX11-LABEL: i8_zext_inreg_i32_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; 
GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -2149,7 +2166,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_hi1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2168,7 +2185,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_hi1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2184,7 +2201,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; ; GFX10-LABEL: i8_zext_inreg_hi1_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2208,8 +2225,10 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; ; GFX11-LABEL: i8_zext_inreg_hi1_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: 
s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2233,7 +2252,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -2251,7 +2270,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: i8_zext_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -2266,7 +2285,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: i8_zext_i32_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] @@ -2288,8 +2307,8 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; ; GFX11-LABEL: i8_zext_i32_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2310,7 +2329,7 @@ define amdgpu_kernel 
void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v4i8_zext_v4i32_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2338,7 +2357,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: v4i8_zext_v4i32_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2369,7 +2388,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; ; GFX10-LABEL: v4i8_zext_v4i32_to_v4f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2412,8 +2431,10 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; ; GFX11-LABEL: v4i8_zext_v4i32_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3 @@ -2444,7 +2465,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: 
extract_byte0_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2463,7 +2484,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte0_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2479,7 +2500,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte0_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2503,8 +2524,10 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: extract_byte0_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2525,7 +2548,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ 
-2544,7 +2567,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2560,7 +2583,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte1_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2584,8 +2607,10 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: extract_byte1_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2607,7 +2632,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte2_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2626,7 +2651,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte2_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2642,7 +2667,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte2_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2666,8 +2691,10 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: extract_byte2_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2689,7 +2716,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte3_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2708,7 +2735,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte3_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2724,7 +2751,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr 
addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte3_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2748,8 +2775,10 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: extract_byte3_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2771,7 +2800,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: cvt_ubyte0_or_multiuse: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 @@ -2791,7 +2820,7 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: cvt_ubyte0_or_multiuse: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -2811,7 +2840,7 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr ; ; GFX10-LABEL: cvt_ubyte0_or_multiuse: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2839,15 +2868,17 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr ; ; GFX11-LABEL: cvt_ubyte0_or_multiuse: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_or_b32_e32 v0, 0x80000001, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-NEXT: global_store_b32 v2, v0, s[2:3] ; GFX11-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll index fed4b9862dbfb..6799980c18439 100644 --- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll @@ -8,7 +8,7 @@ define protected amdgpu_kernel void @add(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: add: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -30,7 +30,7 @@ define protected amdgpu_kernel void @add(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @sub(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: sub: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; 
CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -52,7 +52,7 @@ define protected amdgpu_kernel void @sub(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @and(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: and: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -74,7 +74,7 @@ define protected amdgpu_kernel void @and(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @or(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: or: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -96,7 +96,7 @@ define protected amdgpu_kernel void @or(ptr addrspace(1) %p, ptr addrspace(1) %q define protected amdgpu_kernel void @xor(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: xor: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -118,7 +118,7 @@ define protected amdgpu_kernel void @xor(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: nand: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -154,7 +154,7 @@ define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @max_workgroup(ptr addrspace(1) %p, ptr addrspace(1) 
%q) { ; CHECK-LABEL: max_workgroup: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -176,7 +176,7 @@ define protected amdgpu_kernel void @max_workgroup(ptr addrspace(1) %p, ptr addr define protected amdgpu_kernel void @max(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: max: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -198,7 +198,7 @@ define protected amdgpu_kernel void @max(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @min_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: min_workgroup: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -220,7 +220,7 @@ define protected amdgpu_kernel void @min_workgroup(ptr addrspace(1) %p, ptr addr define protected amdgpu_kernel void @min(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: min: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -242,7 +242,7 @@ define protected amdgpu_kernel void @min(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @umax_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: umax_workgroup: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: 
s_waitcnt lgkmcnt(0) @@ -264,7 +264,7 @@ define protected amdgpu_kernel void @umax_workgroup(ptr addrspace(1) %p, ptr add define protected amdgpu_kernel void @umax(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: umax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -286,7 +286,7 @@ define protected amdgpu_kernel void @umax(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @umin_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: umin_workgroup: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -308,7 +308,7 @@ define protected amdgpu_kernel void @umin_workgroup(ptr addrspace(1) %p, ptr add define protected amdgpu_kernel void @umin(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: umin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -330,7 +330,7 @@ define protected amdgpu_kernel void @umin(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @cmpxchg(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: cmpxchg: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_mov_b32_e32 v0, 2 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 @@ -354,7 +354,7 @@ define protected amdgpu_kernel void @cmpxchg(ptr addrspace(1) %p, ptr addrspace( define protected amdgpu_kernel void @xchg(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: xchg: ; CHECK: ; %bb.0: -; 
CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -376,7 +376,7 @@ define protected amdgpu_kernel void @xchg(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @inc(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: inc: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -398,7 +398,7 @@ define protected amdgpu_kernel void @inc(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @dec(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: dec: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -420,7 +420,7 @@ define protected amdgpu_kernel void @dec(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: fadd: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -457,7 +457,7 @@ define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: fsub: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -494,7 +494,7 @@ define protected amdgpu_kernel void @fsub(ptr addrspace(1) 
%p, ptr addrspace(1) define protected amdgpu_kernel void @fmin(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: fmin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000 @@ -519,7 +519,7 @@ define protected amdgpu_kernel void @fmin(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @fmax(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: fmax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000 @@ -544,15 +544,15 @@ define protected amdgpu_kernel void @fmax(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @buffer.ptr.atomic.swap(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.swap: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 ; CHECK-NEXT: buffer_atomic_swap v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -566,15 +566,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.swap(ptr addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.add(ptr addrspace(8) 
%rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.add: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 ; CHECK-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -588,15 +588,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.add(ptr addrspace(8) %rsr define protected amdgpu_kernel void @buffer.ptr.atomic.sub(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.sub: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 ; CHECK-NEXT: buffer_atomic_sub v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -610,15 +610,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.sub(ptr addrspace(8) %rsr define protected amdgpu_kernel void 
@buffer.ptr.atomic.smin(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.smin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 ; CHECK-NEXT: buffer_atomic_smin v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -632,15 +632,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.smin(ptr addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.smax(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.smax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 ; CHECK-NEXT: buffer_atomic_smax v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -654,15 +654,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.smax(ptr 
addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.umin(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.umin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 ; CHECK-NEXT: buffer_atomic_umin v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -676,15 +676,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.umin(ptr addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.umax(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.umax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 ; CHECK-NEXT: buffer_atomic_umax v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -698,15 +698,15 @@ define protected 
amdgpu_kernel void @buffer.ptr.atomic.umax(ptr addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.and(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.and: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 ; CHECK-NEXT: buffer_atomic_and v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -720,15 +720,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.and(ptr addrspace(8) %rsr define protected amdgpu_kernel void @buffer.ptr.atomic.or(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.or: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 ; CHECK-NEXT: buffer_atomic_or v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ 
-742,15 +742,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.or(ptr addrspace(8) %rsrc define protected amdgpu_kernel void @buffer.ptr.atomic.xor(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.xor: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 ; CHECK-NEXT: buffer_atomic_xor v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -764,15 +764,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.xor(ptr addrspace(8) %rsr define protected amdgpu_kernel void @buffer.ptr.atomic.inc(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.inc: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 ; CHECK-NEXT: buffer_atomic_inc v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword 
v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -786,15 +786,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.inc(ptr addrspace(8) %rsr define protected amdgpu_kernel void @buffer.ptr.atomic.dec(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.dec: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 ; CHECK-NEXT: buffer_atomic_dec v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -808,16 +808,16 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.dec(ptr addrspace(8) %rsr define protected amdgpu_kernel void @buffer.ptr.atomic.cmpswap(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.cmpswap: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -831,16 +831,17 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.cmpswap(ptr addrspace(8) define protected amdgpu_kernel void @buffer.ptr.atomic.fadd(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.fadd: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v1, 1.0 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-NEXT: buffer_atomic_add_f32 v1, v0, s[4:7], 0 offen glc +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -855,17 +856,18 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.fadd(ptr addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.fmin(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.fmin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen glc +; CHECK-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -880,17 +882,18 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.fmin(ptr addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.fmax(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.fmax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen glc +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll index 1e5ec361d154c..297fe7618672e 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, ptr %arg6, ptr %arg7, ptr %arg8, ptr %arg9) { ; CHECK-LABEL: eggs: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 
-; CHECK-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x8 +; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x8 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_bitcmp0_b32 s0, 0 @@ -33,7 +33,7 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: v_mov_b32_e32 v7, 0 ; CHECK-NEXT: .LBB0_3: ; %bb41 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x48 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x48 ; CHECK-NEXT: v_mov_b32_e32 v8, s10 ; CHECK-NEXT: v_mov_b32_e32 v9, s11 ; CHECK-NEXT: v_mov_b32_e32 v10, s12 diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll index f414565f78f11..8fa0068a237cd 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @eq_t(float %x) { ; GCN-LABEL: eq_t: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, 1.0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1] @@ -21,7 +21,7 @@ define amdgpu_kernel void @eq_t(float %x) { define amdgpu_kernel void @ne_t(float %x) { ; GCN-LABEL: ne_t: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_nlt_f32_e64 s[0:1], s0, 1.0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1] @@ -38,7 +38,7 @@ define amdgpu_kernel void @ne_t(float %x) { define amdgpu_kernel void @eq_f(float %x) { ; GCN-LABEL: eq_f: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_nlt_f32_e64 s[0:1], s0, 1.0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1] @@ -55,7 +55,7 @@ define amdgpu_kernel 
void @eq_f(float %x) { define amdgpu_kernel void @ne_f(float %x) { ; GCN-LABEL: ne_f: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, 1.0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll index 8f31bb1fe0a81..f298a95c63485 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -7,11 +7,11 @@ define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) { ; GCN-LABEL: uniform_vec_0_i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s4, s2, 16 +; GCN-NEXT: s_lshl_b32 s4, s4, 16 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -19,33 +19,33 @@ define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) { ; ; GFX9-LABEL: uniform_vec_0_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, s4, 16 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_lshl_b32 s2, s4, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_0_i16: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX906-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x24 +; GFX906-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_lshl_b32 s0, s4, 16 -; GFX906-NEXT: v_mov_b32_e32 v1, s0 -; GFX906-NEXT: global_store_dword v0, v1, s[2:3] +; GFX906-NEXT: s_lshl_b32 s2, s4, 16 +; GFX906-NEXT: v_mov_b32_e32 v1, s2 +; GFX906-NEXT: global_store_dword v0, v1, s[0:1] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: uniform_vec_0_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_lshl_b32 s2, s4, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -92,11 +92,11 @@ define i32 @divergent_vec_0_i16(i16 %a) { define amdgpu_kernel void @uniform_vec_i16_0(ptr addrspace(1) %out, i16 %a) { ; GCN-LABEL: uniform_vec_i16_0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s4, s2, 0xffff +; GCN-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -104,33 +104,33 @@ define amdgpu_kernel void @uniform_vec_i16_0(ptr addrspace(1) %out, i16 %a) { ; ; GFX9-LABEL: uniform_vec_i16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, 0xffff, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_i16_0: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX906-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_and_b32 s0, 0xffff, s4 -; GFX906-NEXT: v_mov_b32_e32 v1, s0 -; GFX906-NEXT: global_store_dword v0, v1, s[2:3] +; GFX906-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX906-NEXT: v_mov_b32_e32 v1, s2 +; GFX906-NEXT: global_store_dword v0, v1, s[0:1] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: uniform_vec_i16_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -177,11 +177,11 @@ define i32 @divergent_vec_i16_0(i16 %a) { define amdgpu_kernel void @uniform_vec_f16_0(ptr addrspace(1) %out, half %a) { ; GCN-LABEL: uniform_vec_f16_0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s4, s2, 0xffff +; GCN-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 @@ -189,33 +189,33 @@ define amdgpu_kernel void @uniform_vec_f16_0(ptr addrspace(1) %out, half %a) { ; ; GFX9-LABEL: uniform_vec_f16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, 0xffff, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_f16_0: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX906-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_and_b32 s0, 0xffff, s4 -; GFX906-NEXT: v_mov_b32_e32 v1, s0 -; GFX906-NEXT: global_store_dword v0, v1, s[2:3] +; GFX906-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX906-NEXT: v_mov_b32_e32 v1, s2 +; GFX906-NEXT: global_store_dword v0, v1, s[0:1] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: uniform_vec_f16_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -262,7 +262,7 @@ define float @divergent_vec_f16_0(half %a) { define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspace(4) 
%in1) { ; GCN-LABEL: uniform_vec_i16_LL: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 ; GCN-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -277,7 +277,7 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX9-LABEL: uniform_vec_i16_LL: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -290,7 +290,7 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX906-LABEL: uniform_vec_i16_LL: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -303,7 +303,7 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX11-LABEL: uniform_vec_i16_LL: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0 @@ -361,7 +361,7 @@ define i32 @divergent_vec_i16_LL(i16 %a, i16 %b) { define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 %b) { ; GCN-LABEL: uniform_vec_i16_LH: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s3, 0xffff0000 @@ -376,7 +376,7 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 ; ; GFX9-LABEL: uniform_vec_i16_LH: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_pack_lh_b32_b16 s2, s2, s3 @@ -386,7 +386,7 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 ; ; GFX906-LABEL: uniform_vec_i16_LH: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_pack_lh_b32_b16 s2, s2, s3 @@ -396,7 +396,7 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 ; ; GFX11-LABEL: uniform_vec_i16_LH: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_pack_lh_b32_b16 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -452,7 +452,7 @@ define i32 @divergent_vec_i16_LH(i16 %a, i32 %b) { define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GCN-LABEL: uniform_vec_i16_HH: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -466,7 +466,7 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX9-LABEL: uniform_vec_i16_HH: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_pack_hh_b32_b16 s2, s2, s3 @@ -476,7 +476,7 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX906-LABEL: uniform_vec_i16_HH: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_pack_hh_b32_b16 s2, s2, s3 @@ -486,7 +486,7 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX11-LABEL: uniform_vec_i16_HH: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_pack_hh_b32_b16 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -546,7 +546,7 @@ define i32 @divergent_vec_i16_HH(i32 %a, i32 %b) { define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspace(4) %in1) { ; GCN-LABEL: uniform_vec_f16_LL: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 ; GCN-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -561,7 +561,7 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX9-LABEL: uniform_vec_f16_LL: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -574,7 +574,7 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX906-LABEL: uniform_vec_f16_LL: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -587,7 +587,7 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX11-LABEL: uniform_vec_f16_LL: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0 @@ 
-684,10 +684,10 @@ entry: define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(ptr addrspace(3) %in, ptr addrspace(1) %out) #0 { ; GCN-LABEL: build_vec_v2i16_undeflo_uniform: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s2, s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GCN-NEXT: s_load_dword s4, s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_read_u16 v0, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -698,35 +698,35 @@ define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(ptr addrspace(3) %in, ; ; GFX9-LABEL: build_vec_v2i16_undeflo_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_read_u16_d16 v0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: build_vec_v2i16_undeflo_uniform: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX906-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v0, s4 ; GFX906-NEXT: ds_read_u16 v0, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_store_dword v1, v0, s[2:3] +; GFX906-NEXT: global_store_dword v1, v0, s[0:1] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: build_vec_v2i16_undeflo_uniform: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 
0x2c +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX11-NEXT: ds_load_u16_d16 v0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll index 8c3155fc5c6ea..d99e9699c2789 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @uniform_sext_in_reg_i8_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GCN-LABEL: uniform_sext_in_reg_i8_to_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_i32 s2, s2, s3 @@ -25,7 +25,7 @@ define amdgpu_kernel void @uniform_sext_in_reg_i8_to_i32(ptr addrspace(1) %out, define amdgpu_kernel void @divergent_sext_in_reg_i8_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GCN-LABEL: divergent_sext_in_reg_i8_to_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -48,7 +48,7 @@ define amdgpu_kernel void @divergent_sext_in_reg_i8_to_i32(ptr addrspace(1) %out define amdgpu_kernel void @uniform_sext_in_reg_i16_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GCN-LABEL: uniform_sext_in_reg_i16_to_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_i32 s2, s2, s3 @@ -69,7 +69,7 @@ define amdgpu_kernel void 
@uniform_sext_in_reg_i16_to_i32(ptr addrspace(1) %out, define amdgpu_kernel void @divergent_sext_in_reg_i16_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GCN-LABEL: divergent_sext_in_reg_i16_to_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll index c3a6cd5975a77..ae4d302e04a7c 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll @@ -4,9 +4,9 @@ define amdgpu_kernel void @uniform_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x, i1 %z) { ; GCN-LABEL: name: uniform_trunc_i16_to_i1 ; GCN: bb.0 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: liveins: $sgpr2_sgpr3 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4) ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 11, 0 :: (dereferenceable invariant load (s32) from %ir.z.kernarg.offset.align.down, addrspace 4) ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 @@ -55,9 +55,9 @@ define i1 @divergent_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x, i1 %z) { define amdgpu_kernel void @uniform_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %x, i1 %z) { ; GCN-LABEL: name: uniform_trunc_i32_to_i1 ; GCN: bb.0 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: liveins: $sgpr2_sgpr3 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: 
[[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4) ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 11, 0 :: (dereferenceable invariant load (s64) from %ir.x.kernarg.offset, align 4, addrspace 4) ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 @@ -106,9 +106,9 @@ define i1 @divergent_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %x, i1 %z) { define amdgpu_kernel void @uniform_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x, i1 %z) { ; GCN-LABEL: name: uniform_trunc_i64_to_i1 ; GCN: bb.0 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: liveins: $sgpr2_sgpr3 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4) ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s32) from %ir.z.kernarg.offset.align.down, addrspace 4) ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1 diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll index b0e1da3b8eecb..75d9dd924a4d6 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @ds1align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds1align1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_u8 v0, v0 @@ -23,7 
+23,7 @@ define amdgpu_kernel void @ds1align1(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds2align1: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 @@ -37,7 +37,7 @@ define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; ALIGNED-GISEL-LABEL: ds2align1: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 @@ -52,7 +52,7 @@ define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; UNALIGNED-LABEL: ds2align1: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_u16 v0, v0 @@ -68,7 +68,7 @@ define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds2align2(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds2align2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_u16 v0, v0 @@ -84,7 +84,7 @@ define amdgpu_kernel void @ds2align2(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds4align1: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; ALIGNED-SDAG-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 @@ -104,7 +104,7 @@ define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; ALIGNED-GISEL-LABEL: ds4align1: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, 8 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -130,7 +130,7 @@ define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; UNALIGNED-LABEL: ds4align1: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b32 v0, v0 @@ -146,7 +146,7 @@ define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds4align2: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 @@ -160,7 +160,7 @@ define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; ALIGNED-GISEL-LABEL: ds4align2: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 @@ -174,7 +174,7 @@ define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; UNALIGNED-LABEL: ds4align2: ; UNALIGNED: ; %bb.0: -; 
UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b32 v0, v0 @@ -190,7 +190,7 @@ define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds4align4(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds4align4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_b32 v0, v0 @@ -206,7 +206,7 @@ define amdgpu_kernel void @ds4align4(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds8align1: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 @@ -234,7 +234,7 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; ALIGNED-GISEL-LABEL: ds8align1: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 @@ -275,7 +275,7 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; UNALIGNED-LABEL: ds8align1: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b64 v[0:1], v0 @@ -291,7 +291,7 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr 
addrspace(3) %out define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds8align2: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:4 @@ -311,7 +311,7 @@ define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; ALIGNED-GISEL-LABEL: ds8align2: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 @@ -331,7 +331,7 @@ define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; UNALIGNED-LABEL: ds8align2: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b64 v[0:1], v0 @@ -347,7 +347,7 @@ define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds8align4(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds8align4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 @@ -363,7 +363,7 @@ define amdgpu_kernel void @ds8align4(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds8align8(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds8align8: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; 
GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_b64 v[0:1], v0 @@ -379,7 +379,7 @@ define amdgpu_kernel void @ds8align8(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds12align1: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 @@ -415,7 +415,7 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; ALIGNED-GISEL-LABEL: ds12align1: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 @@ -473,7 +473,7 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-LABEL: ds12align1: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0 @@ -489,7 +489,7 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds12align2: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:8 @@ -513,7 +513,7 @@ define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; ALIGNED-GISEL-LABEL: ds12align2: ; ALIGNED-GISEL: ; 
%bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 @@ -539,7 +539,7 @@ define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-LABEL: ds12align2: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0 @@ -555,7 +555,7 @@ define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-LABEL: ds12align4: ; ALIGNED: ; %bb.0: -; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-NEXT: v_mov_b32_e32 v2, s0 ; ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 @@ -569,7 +569,7 @@ define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-SDAG-LABEL: ds12align4: ; UNALIGNED-SDAG: ; %bb.0: -; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 @@ -583,7 +583,7 @@ define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-GISEL-LABEL: ds12align4: ; UNALIGNED-GISEL: ; %bb.0: -; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-GISEL-NEXT: ds_read_b96 v[0:2], v0 @@ -599,7 +599,7 @@ define 
amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds12align8: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; ALIGNED-SDAG-NEXT: ds_read_b64 v[0:1], v2 @@ -613,7 +613,7 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; ALIGNED-GISEL-LABEL: ds12align8: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; ALIGNED-GISEL-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 @@ -627,7 +627,7 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-SDAG-LABEL: ds12align8: ; UNALIGNED-SDAG: ; %bb.0: -; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-SDAG-NEXT: ds_read_b32 v2, v0 offset:8 @@ -641,7 +641,7 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-GISEL-LABEL: ds12align8: ; UNALIGNED-GISEL: ; %bb.0: -; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-GISEL-NEXT: ds_read_b96 v[0:2], v0 @@ -657,7 +657,7 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds12align16(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds12align16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_b96 v[0:2], v0 @@ -673,7 +673,7 @@ define amdgpu_kernel void @ds12align16(ptr addrspace(3) %in, ptr addrspace(3) %o define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds16align1: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 @@ -716,7 +716,7 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; ALIGNED-GISEL-LABEL: ds16align1: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 @@ -789,7 +789,7 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-LABEL: ds16align1: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0 @@ -805,7 +805,7 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds16align2: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:12 @@ -835,7 +835,7 @@ define amdgpu_kernel void 
@ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; ALIGNED-GISEL-LABEL: ds16align2: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 @@ -867,7 +867,7 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-LABEL: ds16align2: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0 @@ -883,7 +883,7 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-LABEL: ds16align4: ; ALIGNED: ; %bb.0: -; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-NEXT: v_mov_b32_e32 v2, s0 ; ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 @@ -897,7 +897,7 @@ define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-SDAG-LABEL: ds16align4: ; UNALIGNED-SDAG: ; %bb.0: -; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset0:2 offset1:3 @@ -911,7 +911,7 @@ define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-GISEL-LABEL: ds16align4: ; UNALIGNED-GISEL: ; %bb.0: -; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; 
UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-GISEL-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 @@ -927,7 +927,7 @@ define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds16align8(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds16align8: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 @@ -943,7 +943,7 @@ define amdgpu_kernel void @ds16align8(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds16align16(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds16align16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_b128 v[0:3], v0 diff --git a/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll b/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll index aa1d44c31606b..31bbe6fbbaa14 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll @@ -2,7 +2,7 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s ; GCN-LABEL: ds_read32_combine_stride_400: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -47,7 +47,7 @@ bb: } ; GCN-LABEL: ds_read32_combine_stride_20: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -91,7 +91,7 @@ bb: } ; GCN-LABEL: 
ds_read32_combine_stride_400_back: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -136,7 +136,7 @@ bb: } ; GCN-LABEL: ds_read32_combine_stride_8192: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:32 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:64 offset1:96 @@ -172,7 +172,7 @@ bb: } ; GCN-LABEL: ds_read32_combine_stride_8192_shifted: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] @@ -206,7 +206,7 @@ bb: } ; GCN-LABEL: ds_read64_combine_stride_400: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -246,7 +246,7 @@ bb: } ; GCN-LABEL: ds_read64_combine_stride_8192_shifted: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] @@ -280,7 +280,7 @@ bb: } ; GCN-LABEL: ds_write32_combine_stride_400: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -316,7 +316,7 @@ bb: } ; GCN-LABEL: ds_write32_combine_stride_400_back: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 
[[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -352,7 +352,7 @@ bb: } ; GCN-LABEL: ds_write32_combine_stride_8192: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32 ; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:64 offset1:96 @@ -379,7 +379,7 @@ bb: } ; GCN-LABEL: ds_write32_combine_stride_8192_shifted: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[BASE:v[0-9]+]], vcc, 4, [[BASE]] @@ -406,7 +406,7 @@ bb: } ; GCN-LABEL: ds_write64_combine_stride_400: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -437,7 +437,7 @@ bb: } ; GCN-LABEL: ds_write64_combine_stride_8192_shifted: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[BASE]], vcc, 8, [[BASE]] diff --git a/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll b/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll index 5814b8a8ceda4..7d75f1947b51a 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll @@ -9,7 +9,7 @@ ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27 ; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:8 ; GCN: s_waitcnt lgkmcnt({{[0-9]+}}) -define amdgpu_kernel void @ds_combine_nodep(ptr addrspace(1) %out, ptr addrspace(3) %inptr) { +define amdgpu_kernel void @ds_combine_nodep(ptr addrspace(1) %out, ptr addrspace(3) %inptr) #0 { %addr0 = getelementptr i8, ptr 
addrspace(3) %inptr, i32 24 %load0 = load <3 x float>, ptr addrspace(3) %addr0, align 4 @@ -37,7 +37,7 @@ define amdgpu_kernel void @ds_combine_nodep(ptr addrspace(1) %out, ptr addrspace ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:27 ; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27 -define amdgpu_kernel void @ds_combine_WAR(ptr addrspace(1) %out, ptr addrspace(3) %inptr) { +define amdgpu_kernel void @ds_combine_WAR(ptr addrspace(1) %out, ptr addrspace(3) %inptr) #0 { %addr0 = getelementptr i8, ptr addrspace(3) %inptr, i32 100 %load0 = load <3 x float>, ptr addrspace(3) %addr0, align 4 @@ -67,7 +67,7 @@ define amdgpu_kernel void @ds_combine_WAR(ptr addrspace(1) %out, ptr addrspace(3 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27 ; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32 ; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:104 -define amdgpu_kernel void @ds_combine_RAW(ptr addrspace(1) %out, ptr addrspace(3) %inptr) { +define amdgpu_kernel void @ds_combine_RAW(ptr addrspace(1) %out, ptr addrspace(3) %inptr) #0 { %addr0 = getelementptr i8, ptr addrspace(3) %inptr, i32 24 %load0 = load <3 x float>, ptr addrspace(3) %addr0, align 4 @@ -96,7 +96,7 @@ define amdgpu_kernel void @ds_combine_RAW(ptr addrspace(1) %out, ptr addrspace(3 ; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:108 ; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27 ; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:104 -define amdgpu_kernel void @ds_combine_WAR_RAW(ptr addrspace(1) %out, ptr addrspace(3) %inptr) { +define amdgpu_kernel void @ds_combine_WAR_RAW(ptr addrspace(1) %out, ptr addrspace(3) %inptr) #0 { %addr0 = getelementptr i8, ptr addrspace(3) %inptr, i32 100 %load0 = load <3 x float>, ptr addrspace(3) %addr0, align 4 @@ -115,3 +115,5 @@ define amdgpu_kernel void @ds_combine_WAR_RAW(ptr addrspace(1) %out, ptr addrspa store float 
%sum, ptr addrspace(1) %out, align 4 ret void } + +attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll index 7b9b130e1cf79..41a9d7999e80a 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -36,8 +36,9 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 { ; ; GFX11-LABEL: write_ds_sub0_offset0_global: ; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:12 ; GFX11-NEXT: s_endpgm @@ -53,7 +54,7 @@ entry: define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.val) #0 { ; CI-LABEL: write_ds_sub0_offset0_global_clamp_bit: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dword s0, s[0:1], 0x0 +; CI-NEXT: s_load_dword s0, s[2:3], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; CI-NEXT: s_mov_b64 vcc, 0 @@ -73,7 +74,7 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v ; ; GFX9-LABEL: write_ds_sub0_offset0_global_clamp_bit: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: s_mov_b64 vcc, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v0 @@ -90,7 +91,7 @@ define amdgpu_kernel void 
@write_ds_sub0_offset0_global_clamp_bit(float %dummy.v ; ; GFX10-LABEL: write_ds_sub0_offset0_global_clamp_bit: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x7b @@ -106,10 +107,11 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v ; ; GFX11-LABEL: write_ds_sub0_offset0_global_clamp_bit: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 vcc_lo, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -135,7 +137,7 @@ entry: define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy.val) #0 { ; CI-LABEL: write_ds_sub_max_offset_global_clamp_bit: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[0:1], 0x0 +; CI-NEXT: s_load_dword s0, s[2:3], 0x0 ; CI-NEXT: s_mov_b64 vcc, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0x7b ; CI-NEXT: v_mov_b32_e32 v2, 0 @@ -154,7 +156,7 @@ define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy ; ; GFX9-LABEL: write_ds_sub_max_offset_global_clamp_bit: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: s_mov_b64 vcc, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7b ; GFX9-NEXT: v_mov_b32_e32 v4, 0 @@ -170,7 +172,7 @@ define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy ; ; GFX10-LABEL: write_ds_sub_max_offset_global_clamp_bit: ; GFX10: ; %bb.0: -; GFX10-NEXT: 
s_load_dword s0, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x7b @@ -185,7 +187,7 @@ define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy ; ; GFX11-LABEL: write_ds_sub_max_offset_global_clamp_bit: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 vcc_lo, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0x7b :: v_dual_mov_b32 v3, 0 @@ -233,7 +235,9 @@ define amdgpu_kernel void @add_x_shl_max_offset() #1 { ; ; GFX11-LABEL: add_x_shl_max_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 4, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX11-NEXT: ds_store_b8 v0, v1 offset:65535 ; GFX11-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -275,8 +279,9 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_alt() #1 { ; ; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_alt: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: ds_store_b8 v0, v1 ; GFX11-NEXT: s_endpgm @@ -319,8 +324,9 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_not_canonical() #1 { ; ; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: ds_store_b8 v0, v1 ; GFX11-NEXT: s_endpgm @@ -361,8 +367,9 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_p1() #1 { ; ; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_p1: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x10000, v0 ; GFX11-NEXT: ds_store_b8 v0, v1 ; GFX11-NEXT: s_endpgm @@ -407,7 +414,8 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use() #1 { ; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:123 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:456 @@ -455,8 +463,9 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 { ; ; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:123 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:123 @@ -503,9 +512,10 @@ define amdgpu_kernel void 
@add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 { ; ; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x7b +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x3fb, v0 ; GFX11-NEXT: ds_store_2addr_b32 v0, v1, v2 offset1:1 ; GFX11-NEXT: s_endpgm @@ -521,7 +531,7 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 { define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit(float %dummy.val) #1 { ; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[0:1], 0x0 +; CI-NEXT: s_load_dword s0, s[2:3], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x3fb, v0 ; CI-NEXT: s_mov_b64 vcc, 0 @@ -542,7 +552,7 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_ ; ; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: s_mov_b64 vcc, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_sub_u32_e32 v3, 0x3fb, v0 @@ -560,7 +570,7 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_ ; ; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 @@ -578,11 +588,12 @@ define amdgpu_kernel void 
@add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_ ; ; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 vcc_lo, 0 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, 0x7b +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x3fb, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -637,9 +648,10 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() # ; ; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x7b +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x3fc, v0 ; GFX11-NEXT: ds_store_2addr_b32 v0, v1, v2 offset1:1 ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll index 777a8f3fef1c1..b72cd7e1d1eca 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -15,7 +15,7 @@ define amdgpu_kernel void @simple_read2_f32(ptr addrspace(1) %out) #0 { ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 
s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -28,7 +28,7 @@ define amdgpu_kernel void @simple_read2_f32(ptr addrspace(1) %out) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -51,7 +51,7 @@ define amdgpu_kernel void @simple_read2_f32_max_offset(ptr addrspace(1) %out) #0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:255 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -64,7 +64,7 @@ define amdgpu_kernel void @simple_read2_f32_max_offset(ptr addrspace(1) %out) #0 ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:255 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -88,7 +88,7 @@ define amdgpu_kernel void @simple_read2_f32_too_far(ptr addrspace(1) %out) #0 { ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b32 v1, v0 ; CI-NEXT: ds_read_b32 v2, v0 offset:1028 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -102,7 +102,7 @@ define amdgpu_kernel void @simple_read2_f32_too_far(ptr addrspace(1) %out) #0 { ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: ds_read_b32 v1, v0 ; GFX9-NEXT: ds_read_b32 v2, v0 offset:1028 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -126,7 +126,7 @@ define amdgpu_kernel void @simple_read2_f32_x2(ptr addrspace(1) %out) #0 { ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8 ; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -142,7 +142,7 @@ define amdgpu_kernel void @simple_read2_f32_x2(ptr addrspace(1) %out) #0 { ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset1:8 ; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_add_f32_e32 v1, v2, v3 @@ -184,7 +184,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_barrier(ptr addrspace(1) %out) #0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_barrier ; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: s_mov_b32 s2, 0 @@ -202,7 +202,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_barrier(ptr addrspace(1) %out) #0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_barrier ; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v2, v3 @@ -245,7 +245,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(ptr addrspace(1) %ou ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset0:2 offset1:8 ; 
CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -261,7 +261,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(ptr addrspace(1) %ou ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset0:2 offset1:8 ; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_add_f32_e32 v1, v2, v3 @@ -301,7 +301,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(ptr addrspace(1) %ou define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2 x ptr addrspace(3)> %lds.ptr) #0 { ; CI-LABEL: read2_ptr_is_subreg_arg_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -319,7 +319,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2 ; ; GFX9-LABEL: read2_ptr_is_subreg_arg_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -352,7 +352,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2 define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(ptr addrspace(1) %out, <2 x ptr addrspace(3)> %lds.ptr) #0 { ; CI-LABEL: read2_ptr_is_subreg_arg_offset_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; 
CI-NEXT: s_waitcnt lgkmcnt(0) @@ -370,7 +370,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(ptr addrspace(1) % ; ; GFX9-LABEL: read2_ptr_is_subreg_arg_offset_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -406,7 +406,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_f32(ptr addrspace(1) %out) #0 { ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -419,7 +419,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_f32(ptr addrspace(1) %out) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -449,7 +449,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_0(ptr addrspace(1) %out) #0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b32 v1, v0 ; CI-NEXT: ds_read_b32 v2, v0 offset:32 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -463,7 +463,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_0(ptr addrspace(1) %out) #0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: ds_read_b32 v1, v0 ; GFX9-NEXT: ds_read_b32 v2, v0 offset:32 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 ; 
GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -487,7 +487,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(ptr addrspace(1) %out) #0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b32 v1, v0 ; CI-NEXT: ds_read_b32 v2, v0 offset:32 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -501,7 +501,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(ptr addrspace(1) %out) #0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: ds_read_b32 v1, v0 ; GFX9-NEXT: ds_read_b32 v2, v0 offset:32 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -522,13 +522,11 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(ptr addrspace(1) %out) #0 define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; CI-LABEL: unaligned_read2_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[0:1], 0x2 +; CI-NEXT: s_load_dword s0, s[2:3], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 ; CI-NEXT: ds_read_u8 v2, v1 offset:34 ; CI-NEXT: ds_read_u8 v3, v1 offset:32 ; CI-NEXT: ds_read_u8 v4, v1 offset:3 @@ -537,15 +535,13 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: ds_read_u8 v7, v1 ; CI-NEXT: ds_read_u8 v8, v1 offset:33 ; CI-NEXT: ds_read_u8 v1, v1 offset:35 -; CI-NEXT: s_waitcnt lgkmcnt(5) +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; CI-NEXT: s_waitcnt lgkmcnt(3) ; CI-NEXT: v_lshlrev_b32_e32 v6, 
8, v6 ; CI-NEXT: v_or_b32_e32 v4, v4, v5 -; CI-NEXT: s_waitcnt lgkmcnt(1) -; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8 -; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8 ; CI-NEXT: v_or_b32_e32 v1, v1, v2 ; CI-NEXT: v_or_b32_e32 v6, v6, v7 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -554,6 +550,7 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: v_or_b32_e32 v4, v4, v6 ; CI-NEXT: v_or_b32_e32 v1, v1, v3 ; CI-NEXT: v_add_f32_e32 v2, v4, v1 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 @@ -561,8 +558,8 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-ALIGNED-LABEL: unaligned_read2_f32: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dword s4, s[2:3], 0x8 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 @@ -585,17 +582,17 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v8 ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-ALIGNED-NEXT: s_endpgm ; ; GFX9-UNALIGNED-LABEL: unaligned_read2_f32: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8 +; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[2:3], 0x8 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s2, v2 +; 
GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2 ; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1] @@ -615,13 +612,11 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; CI-LABEL: unaligned_offset_read2_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[0:1], 0x2 +; CI-NEXT: s_load_dword s0, s[2:3], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 ; CI-NEXT: ds_read_u8 v2, v1 offset:11 ; CI-NEXT: ds_read_u8 v3, v1 offset:9 ; CI-NEXT: ds_read_u8 v4, v1 offset:8 @@ -630,15 +625,13 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr ; CI-NEXT: ds_read_u8 v7, v1 offset:5 ; CI-NEXT: ds_read_u8 v8, v1 offset:10 ; CI-NEXT: ds_read_u8 v1, v1 offset:12 -; CI-NEXT: s_waitcnt lgkmcnt(5) +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; CI-NEXT: s_waitcnt lgkmcnt(3) ; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; CI-NEXT: v_or_b32_e32 v4, v4, v5 -; CI-NEXT: s_waitcnt lgkmcnt(1) -; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8 -; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8 ; CI-NEXT: v_or_b32_e32 v1, v1, v2 ; CI-NEXT: v_or_b32_e32 v6, v6, v7 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -647,6 +640,7 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr ; CI-NEXT: v_or_b32_e32 v4, v4, v6 ; CI-NEXT: v_or_b32_e32 v1, v1, v3 ; CI-NEXT: 
v_add_f32_e32 v2, v4, v1 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 @@ -654,8 +648,8 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr ; ; GFX9-ALIGNED-LABEL: unaligned_offset_read2_f32: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dword s4, s[2:3], 0x8 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 @@ -678,17 +672,17 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v8 ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-ALIGNED-NEXT: s_endpgm ; ; GFX9-UNALIGNED-LABEL: unaligned_offset_read2_f32: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8 +; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[2:3], 0x8 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s2, v2 +; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2 ; GFX9-UNALIGNED-NEXT: ds_read_b64 v[0:1], v0 offset:5 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1] @@ -708,44 +702,41 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @misaligned_2_simple_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) 
%lds) #0 { ; CI-LABEL: misaligned_2_simple_read2_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[0:1], 0x2 +; CI-NEXT: s_load_dword s0, s[2:3], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 ; CI-NEXT: ds_read_u16 v2, v1 offset:32 ; CI-NEXT: ds_read_u16 v3, v1 offset:2 ; CI-NEXT: ds_read_u16 v4, v1 ; CI-NEXT: ds_read_u16 v1, v1 offset:34 -; CI-NEXT: s_mov_b32 s2, 0 -; CI-NEXT: s_waitcnt lgkmcnt(2) +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: s_waitcnt lgkmcnt(1) ; CI-NEXT: v_or_b32_e32 v3, v3, v4 -; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v1, v1, v2 ; CI-NEXT: v_add_f32_e32 v2, v3, v1 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; GFX9-ALIGNED-LABEL: misaligned_2_simple_read2_f32: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8 +; GFX9-ALIGNED-NEXT: s_load_dword s0, s[2:3], 0x8 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s2, v0 +; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s0, v0 ; GFX9-ALIGNED-NEXT: ds_read_u16 v2, v1 ; GFX9-ALIGNED-NEXT: ds_read_u16 v3, v1 offset:2 ; GFX9-ALIGNED-NEXT: ds_read_u16 v4, v1 offset:32 ; GFX9-ALIGNED-NEXT: ds_read_u16 v1, v1 offset:34 -; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; 
GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v4 ; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[0:1] @@ -753,12 +744,12 @@ define amdgpu_kernel void @misaligned_2_simple_read2_f32(ptr addrspace(1) %out, ; ; GFX9-UNALIGNED-LABEL: misaligned_2_simple_read2_f32: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8 +; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[2:3], 0x8 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s2, v2 +; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2 ; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1] @@ -781,7 +772,7 @@ define amdgpu_kernel void @simple_read2_f64(ptr addrspace(1) %out) #0 { ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b64 v[0:3], v4 offset1:8 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v5, 0 @@ -794,7 +785,7 @@ define amdgpu_kernel void @simple_read2_f64(ptr addrspace(1) %out) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -817,7 +808,7 @@ define amdgpu_kernel void @simple_read2_f64_max_offset(ptr addrspace(1) %out) #0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b64 v[0:3], v4 offset1:255 -; 
CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v5, 0 @@ -830,7 +821,7 @@ define amdgpu_kernel void @simple_read2_f64_max_offset(ptr addrspace(1) %out) #0 ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:255 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -854,7 +845,7 @@ define amdgpu_kernel void @simple_read2_f64_too_far(ptr addrspace(1) %out) #0 { ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b64 v[1:2], v0 ; CI-NEXT: ds_read_b64 v[3:4], v0 offset:2056 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -868,7 +859,7 @@ define amdgpu_kernel void @simple_read2_f64_too_far(ptr addrspace(1) %out) #0 { ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: ds_read_b64 v[0:1], v4 ; GFX9-NEXT: ds_read_b64 v[2:3], v4 offset:2056 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -889,15 +880,15 @@ define amdgpu_kernel void @simple_read2_f64_too_far(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @misaligned_read2_f64(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; CI-LABEL: misaligned_read2_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[0:1], 0x2 +; CI-NEXT: s_load_dword s0, s[2:3], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: 
v_add_i32_e32 v3, vcc, s2, v0 +; CI-NEXT: v_add_i32_e32 v3, vcc, s0, v0 ; CI-NEXT: ds_read2_b32 v[1:2], v3 offset1:1 ; CI-NEXT: ds_read2_b32 v[3:4], v3 offset0:14 offset1:15 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_f64 v[2:3], v[1:2], v[3:4] @@ -907,13 +898,13 @@ define amdgpu_kernel void @misaligned_read2_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: misaligned_read2_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x8 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, s2, v4 +; GFX9-NEXT: v_add_u32_e32 v2, s0, v4 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 ; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset0:14 offset1:15 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -938,7 +929,7 @@ define amdgpu_kernel void @load_constant_adjacent_offsets(ptr addrspace(1) %out) ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b64 v[0:1], v0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -950,7 +941,7 @@ define amdgpu_kernel void @load_constant_adjacent_offsets(ptr addrspace(1) %out) ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_read_b64 v[0:1], v2 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -968,7 +959,7 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(ptr addrspace(1) %out) ; CI-NEXT: 
v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -980,7 +971,7 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(ptr addrspace(1) %out) ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:2 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -1000,7 +991,7 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(ptr addrspace(1) % ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b128 v[0:3], v0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1013,7 +1004,7 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(ptr addrspace(1) % ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: ds_read_b128 v[0:3], v4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc @@ -1035,7 +1026,7 @@ define amdgpu_kernel void @load_misaligned64_constant_large_offsets(ptr addrspac ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b64 v[0:1], v2 offset:16384 ; CI-NEXT: ds_read_b64 v[2:3], v2 offset:32760 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1049,7 +1040,7 @@ define amdgpu_kernel void 
@load_misaligned64_constant_large_offsets(ptr addrspac ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: ds_read_b64 v[0:1], v4 offset:16384 ; GFX9-NEXT: ds_read_b64 v[2:3], v4 offset:32760 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc @@ -1068,12 +1059,11 @@ define amdgpu_kernel void @load_misaligned64_constant_large_offsets(ptr addrspac define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, i32 %lda, i32 %ldb) #0 { ; CI-LABEL: sgemm_inner_loop_read2_sequence: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; CI-NEXT: s_lshl_b32 s0, s2, 2 -; CI-NEXT: s_add_i32 s1, s0, 0xc20 -; CI-NEXT: s_addk_i32 s0, 0xc60 -; CI-NEXT: v_mov_b32_e32 v0, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_lshl_b32 s4, s6, 2 +; CI-NEXT: s_add_i32 s5, s4, 0xc20 +; CI-NEXT: s_addk_i32 s4, 0xc60 +; CI-NEXT: v_mov_b32_e32 v0, s5 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_lshlrev_b32_e32 v8, 2, v1 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 @@ -1081,24 +1071,29 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, ; CI-NEXT: ds_read2_b32 v[4:5], v8 offset1:1 ; CI-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33 ; CI-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65 -; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_waitcnt lgkmcnt(4) ; CI-NEXT: v_add_f32_e32 v0, v0, v1 +; CI-NEXT: s_waitcnt lgkmcnt(3) ; CI-NEXT: v_add_f32_e32 v0, v0, v2 ; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: s_waitcnt lgkmcnt(2) ; CI-NEXT: v_add_f32_e32 v0, v0, v4 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: v_add_f32_e32 v0, v0, v5 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_f32_e32 v0, v0, v6 ; CI-NEXT: v_add_f32_e32 v0, v0, v7 ; CI-NEXT: v_add_f32_e32 v0, v0, v8 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 
s6, -1 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_add_f32_e32 v0, v0, v9 -; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: sgemm_inner_loop_read2_sequence: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s2, s2, 2 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_lshl_b32 s2, s6, 2 ; GFX9-NEXT: s_add_i32 s3, s2, 0xc20 ; GFX9-NEXT: s_addk_i32 s2, 0xc60 ; GFX9-NEXT: v_mov_b32_e32 v0, s3 @@ -1109,16 +1104,12 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, ; GFX9-NEXT: ds_read2_b32 v[4:5], v8 offset1:1 ; GFX9-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33 ; GFX9-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65 -; GFX9-NEXT: s_waitcnt lgkmcnt(4) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(3) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX9-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v6 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v7 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v8 @@ -1172,28 +1163,28 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, define amdgpu_kernel void @misaligned_read2_v2i32(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { ; CI-LABEL: misaligned_read2_v2i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[0:1], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dword s4, s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 -; CI-NEXT: s_mov_b32 
s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: misaligned_read2_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %load = load <2 x i32>, ptr addrspace(3) %in, align 4 store <2 x i32> %load, ptr addrspace(1) %out, align 8 @@ -1203,28 +1194,28 @@ define amdgpu_kernel void @misaligned_read2_v2i32(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @misaligned_read2_i64(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { ; CI-LABEL: misaligned_read2_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[0:1], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dword s4, s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 -; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: misaligned_read2_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %load = load i64, ptr addrspace(3) %in, align 4 store i64 %load, ptr addrspace(1) %out, align 8 @@ -1234,8 +1225,8 @@ define amdgpu_kernel void @misaligned_read2_i64(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @ds_read_diff_base_interleaving( ; CI-LABEL: ds_read_diff_base_interleaving: ; CI: ; %bb.0: ; %bb -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -1265,10 +1256,10 @@ define amdgpu_kernel void @ds_read_diff_base_interleaving( ; ; GFX9-LABEL: ds_read_diff_base_interleaving: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v2, s4, v1 @@ -1470,7 +1461,7 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) ; CI-NEXT: ds_read_u8 v6, v0 offset:66 ; CI-NEXT: ds_read_u8 v0, v0 offset:65 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: v_or_b32_e32 v1, v2, v1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 @@ -1497,7 +1488,7 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) ; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v2 offset:71 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(7) ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 
+; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v7 @@ -1514,7 +1505,7 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) ; GFX9-UNALIGNED-LABEL: read2_v2i32_align1_odd_offset: ; GFX9-UNALIGNED: ; %bb.0: ; %entry ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-UNALIGNED-NEXT: ds_read_b64 v[0:1], v2 offset:65 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll index 06908d21e5355..9f191fa69f654 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @simple_write2_one_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_one_val_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -23,7 +23,7 @@ define amdgpu_kernel void @simple_write2_one_val_f32(ptr addrspace(1) %C, ptr ad ; ; GFX9-LABEL: simple_write2_one_val_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -44,7 +44,7 @@ define amdgpu_kernel void @simple_write2_one_val_f32(ptr addrspace(1) %C, ptr ad define amdgpu_kernel void @simple_write2_two_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_two_val_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; CI-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -60,7 +60,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32(ptr addrspace(1) %C, ptr ad ; ; GFX9-LABEL: simple_write2_two_val_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -85,7 +85,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32(ptr addrspace(1) %C, ptr ad define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; CI-LABEL: simple_write2_two_val_f32_volatile_0: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -105,7 +105,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1) ; ; GFX9-LABEL: simple_write2_two_val_f32_volatile_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -131,7 +131,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1) define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; CI-LABEL: simple_write2_two_val_f32_volatile_1: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -151,7 +151,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr addrspace(1) ; ; GFX9-LABEL: 
simple_write2_two_val_f32_volatile_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -182,7 +182,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr addrspace(1) define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_two_val_subreg2_mixed_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -199,7 +199,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(ptr addrspace ; ; GFX9-LABEL: simple_write2_two_val_subreg2_mixed_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: ; kill: killed $vgpr4 @@ -229,7 +229,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(ptr addrspace define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_two_val_subreg2_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -244,7 +244,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(ptr addrspace(1) %C ; ; GFX9-LABEL: simple_write2_two_val_subreg2_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ 
-268,7 +268,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(ptr addrspace(1) %C define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_two_val_subreg4_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v0 @@ -283,7 +283,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(ptr addrspace(1) %C ; ; GFX9-LABEL: simple_write2_two_val_subreg4_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -307,7 +307,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(ptr addrspace(1) %C define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_two_val_max_offset_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -323,7 +323,7 @@ define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(ptr addrspace(1) ; ; GFX9-LABEL: simple_write2_two_val_max_offset_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -348,7 +348,7 @@ define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(ptr addrspace(1) define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; CI-LABEL: simple_write2_two_val_too_far_f32: ; CI: ; 
%bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -368,7 +368,7 @@ define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C ; ; GFX9-LABEL: simple_write2_two_val_too_far_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -394,7 +394,7 @@ define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; CI-LABEL: simple_write2_two_val_f32_x2: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -413,7 +413,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr ; ; GFX9-LABEL: simple_write2_two_val_f32_x2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -450,7 +450,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; CI-LABEL: simple_write2_two_val_f32_x2_nonzero_base: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -469,7 +469,7 @@ define amdgpu_kernel void 
@simple_write2_two_val_f32_x2_nonzero_base(ptr addrspa ; ; GFX9-LABEL: simple_write2_two_val_f32_x2_nonzero_base: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -506,21 +506,21 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(ptr addrspa define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1, <2 x ptr addrspace(3)> %lds.ptr) #0 { ; CI-LABEL: write2_ptr_subreg_arg_two_val_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x6 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x6 +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[4:5] +; CI-NEXT: s_mov_b64 s[8:9], s[4:5] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: s_mov_b64 s[4:5], s[6:7] -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; CI-NEXT: v_mov_b32_e32 v1, s8 +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: s_mov_b64 s[2:3], s[10:11] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; CI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: v_mov_b32_e32 v1, s12 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: v_mov_b32_e32 v3, s9 +; CI-NEXT: v_mov_b32_e32 v3, s13 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: ds_write_b32 v1, v2 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -529,14 +529,14 @@ define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(ptr addrspace(1) %C ; ; GFX9-LABEL: write2_ptr_subreg_arg_two_val_f32: ; GFX9: ; %bb.0: -; 
GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x18 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x18 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: ds_write_b32 v0, v1 offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -566,7 +566,7 @@ define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(ptr addrspace(1) %C define amdgpu_kernel void @simple_write2_one_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_one_val_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -580,7 +580,7 @@ define amdgpu_kernel void @simple_write2_one_val_f64(ptr addrspace(1) %C, ptr ad ; ; GFX9-LABEL: simple_write2_one_val_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -601,15 +601,15 @@ define amdgpu_kernel void @simple_write2_one_val_f64(ptr addrspace(1) %C, ptr ad define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in, ptr addrspace(3) %lds) #0 { ; CI-LABEL: misaligned_simple_write2_one_val_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; CI-NEXT: s_load_dword s0, s[0:1], 0x4 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; CI-NEXT: s_load_dword s4, s[2:3], 0x4 +; CI-NEXT: 
s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64 +; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 @@ -618,11 +618,11 @@ define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(ptr addrspace(1) ; ; GFX9-LABEL: misaligned_simple_write2_one_val_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] ; GFX9-NEXT: v_add_u32_e32 v2, s4, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 @@ -642,15 +642,15 @@ define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(ptr addrspace(1) define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in, ptr addrspace(3) %lds) #0 { ; CI-LABEL: unaligned_offset_simple_write2_one_val_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; CI-NEXT: s_load_dword s0, s[0:1], 0x4 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; CI-NEXT: s_load_dword s4, s[2:3], 0x4 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], 
s[0:3], 0 addr64 +; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 24, v1 @@ -675,11 +675,11 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrsp ; ; GFX9-ALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x10 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-ALIGNED-NEXT: s_load_dword s4, s[2:3], 0x10 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX9-ALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] ; GFX9-ALIGNED-NEXT: v_add_u32_e32 v2, s4, v2 ; GFX9-ALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:7 @@ -702,11 +702,11 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrsp ; ; GFX9-UNALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x10 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[2:3], 0x10 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX9-UNALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] ; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, s4, v2 ; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX9-UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] offset:5 @@ -726,7 +726,7 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrsp define amdgpu_kernel void @simple_write2_two_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_two_val_f64: ; CI: ; %bb.0: -; CI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -742,7 +742,7 @@ define amdgpu_kernel void @simple_write2_two_val_f64(ptr addrspace(1) %C, ptr ad ; ; GFX9-LABEL: simple_write2_two_val_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc @@ -868,11 +868,11 @@ define amdgpu_kernel void @store_misaligned64_constant_large_offsets() { define amdgpu_kernel void @write2_sgemm_sequence(ptr addrspace(1) %C, i32 %lda, i32 %ldb, ptr addrspace(1) %in) #0 { ; CI-LABEL: write2_sgemm_sequence: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s0, s[0:1], 0x0 -; CI-NEXT: s_lshl_b32 s1, s2, 2 +; CI-NEXT: s_lshl_b32 s1, s6, 2 ; CI-NEXT: s_add_i32 s2, s1, 0xc20 ; CI-NEXT: s_addk_i32 s1, 0xc60 ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -890,8 +890,8 @@ define amdgpu_kernel void @write2_sgemm_sequence(ptr addrspace(1) %C, i32 %lda, ; ; GFX9-LABEL: write2_sgemm_sequence: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x10 -; GFX9-NEXT: s_lshl_b32 s2, s2, 2 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 +; GFX9-NEXT: s_lshl_b32 s2, s6, 2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: s_add_i32 s1, s2, 0xc20 @@ -945,12 +945,12 @@ define amdgpu_kernel void @write2_sgemm_sequence(ptr addrspace(1) %C, i32 %lda, define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_v4f32_superreg_align4: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 -; CI-NEXT: s_load_dword s4, s[0:1], 0x0 +; 
CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; CI-NEXT: s_load_dword s4, s[2:3], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s0 @@ -963,11 +963,11 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3) ; ; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-ALIGNED-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4 -; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, s1 @@ -979,11 +979,11 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3) ; ; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4 -; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s3 diff --git a/llvm/test/CodeGen/AMDGPU/early-inline.ll b/llvm/test/CodeGen/AMDGPU/early-inline.ll index 
c1a049cf055cf..02ab2a065c0ef 100644 --- a/llvm/test/CodeGen/AMDGPU/early-inline.ll +++ b/llvm/test/CodeGen/AMDGPU/early-inline.ll @@ -25,6 +25,7 @@ entry: ; CHECK-LABEL: @alias_caller( ; CHECK-NOT: call +; CHECK: {{^[}]}} define amdgpu_kernel void @alias_caller(i32 %x) { entry: %res = call i32 @c_alias(i32 %x) diff --git a/llvm/test/CodeGen/AMDGPU/elf-notes.ll b/llvm/test/CodeGen/AMDGPU/elf-notes.ll index d958dde01c3f8..554cb140f4292 100644 --- a/llvm/test/CodeGen/AMDGPU/elf-notes.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-notes.ll @@ -80,9 +80,11 @@ ; R600-NOT: .amd_amdgpu_hsa_metadata ; R600-NOT: .amd_amdgpu_pal_metadata -define amdgpu_kernel void @elf_notes() { +define amdgpu_kernel void @elf_notes() #0 { ret void } +attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } + !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll index 86ec6269b1c9b..32b9f9cb97095 100644 --- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll +++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll @@ -6,10 +6,10 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_mov_b64 s[26:27], s[2:3] ; CHECK-NEXT: s_mov_b64 s[24:25], s[0:1] -; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CHECK-NEXT: s_load_dword s6, s[4:5], 0x4 -; CHECK-NEXT: s_add_u32 s24, s24, s7 +; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 
s[0:1], s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s14, s[6:7], 0x4 +; CHECK-NEXT: s_add_u32 s24, s24, s13 ; CHECK-NEXT: s_addc_u32 s25, s25, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_bitcmp1_b32 s2, 0 @@ -24,7 +24,7 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK-NEXT: s_bitcmp1_b32 s1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; CHECK-NEXT: s_cselect_b64 s[12:13], -1, 0 -; CHECK-NEXT: s_bitcmp1_b32 s6, 8 +; CHECK-NEXT: s_bitcmp1_b32 s14, 8 ; CHECK-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[16:17] ; CHECK-NEXT: s_cselect_b64 s[14:15], -1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll index c744ace37a831..54fb1dc5c0527 100644 --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -93,7 +93,7 @@ bb: define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX7-LABEL: s_add_co_br_user: ; GFX7: ; %bb.0: ; %bb -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_i32 s0, s2, s2 ; GFX7-NEXT: s_cmp_lt_u32 s0, s2 @@ -120,7 +120,7 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; ; GFX9-LABEL: s_add_co_br_user: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_i32 s0, s2, s2 ; GFX9-NEXT: s_cmp_lt_u32 s0, s2 @@ -146,7 +146,7 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; ; GFX10-LABEL: s_add_co_br_user: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_i32 s1, s0, s0 ; GFX10-NEXT: s_cmp_lt_u32 s1, s0 @@ -172,7 +172,7 @@ define amdgpu_kernel void @s_add_co_br_user(i32 
%i) { ; ; GFX11-LABEL: s_add_co_br_user: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_i32 s1, s0, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll index db3ea4df52981..ee1df9aa0d6ce 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll @@ -180,8 +180,8 @@ entry: } ; GCN-LABEL: {{^}}float8_extelt: -; GCN-DAG: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-DAG: s_load_dword [[S0:s[0-9]+]], s[0:1], 0x2c +; GCN-DAG: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DAG: s_load_dword [[S0:s[0-9]+]], s[2:3], 0x2c ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 1.0 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000 @@ -411,10 +411,10 @@ entry: ; GCN-LABEL: {{^}}bit4_extelt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s2, s2, 3 +; GCN-NEXT: s_lshl_b32 s2, s4, 3 ; GCN-NEXT: s_lshr_b32 s2, 0x1000100, s2 ; GCN-NEXT: s_and_b32 s2, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index 70011e56d016e..f4ec16db55d68 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; SI-LABEL: extract_vector_elt_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; 
SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s4, s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -21,7 +21,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: extract_vector_elt_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -36,7 +36,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: extract_vector_elt_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 @@ -62,8 +62,8 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %idx) #0 { ; SI-LABEL: extract_vector_elt_v2f16_dynamic_sgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[2:3], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s1, s[6:7], 0x0 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -77,8 +77,8 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1 ; ; VI-LABEL: extract_vector_elt_v2f16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -95,8 +95,8 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1 ; GFX11-LABEL: extract_vector_elt_v2f16_dynamic_sgpr: ; GFX11: ; 
%bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 ; GFX11-NEXT: s_lshl_b32 s0, s0, 4 @@ -119,8 +119,8 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1 define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, ptr addrspace(1) %idx.ptr) #0 { ; SI-LABEL: extract_vector_elt_v2f16_dynamic_vgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -139,15 +139,15 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1 ; ; VI-LABEL: extract_vector_elt_v2f16_dynamic_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; VI-NEXT: flat_load_dword v2, v[1:2] -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -162,12 +162,14 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1 ; ; GFX11-LABEL: extract_vector_elt_v2f16_dynamic_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v1, s[2:3] -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: global_load_b32 v1, v1, s[0:1] +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -193,7 +195,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1 define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x half> %foo) #0 { ; SI-LABEL: extract_vector_elt_v3f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -208,7 +210,7 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x ; ; VI-LABEL: extract_vector_elt_v3f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -222,7 +224,7 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x ; ; GFX11-LABEL: extract_vector_elt_v3f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -247,8 +249,8 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x half> %foo, i32 %idx) #0 { ; SI-LABEL: dynamic_extract_vector_elt_v3f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xd -; SI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b32 s4, s4, 4 @@ -262,8 +264,8 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou ; ; VI-LABEL: dynamic_extract_vector_elt_v3f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -278,8 +280,8 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou ; GFX11-LABEL: dynamic_extract_vector_elt_v3f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s4, s4, 4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -300,7 +302,7 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_extractelement_v4f16_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -317,7 +319,7 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_extractelement_v4f16_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -336,7 +338,9 @@ define 
amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: v_extractelement_v4f16_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -359,7 +363,7 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_insertelement_v4f16_dynamic_vgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -380,7 +384,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v4f16_dynamic_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -403,20 +407,21 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) % ; ; GFX11-LABEL: v_insertelement_v4f16_dynamic_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX11-NEXT: buffer_load_b32 v3, off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] +; GFX11-NEXT: global_load_b64 
v[0:1], v0, s[2:3] ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 4, v3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b64 v[1:2], v3, v[1:2] -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v2 +; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -434,7 +439,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) % define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) %ptr) #0 { ; SI-LABEL: reduce_load_vector_v8f16_extract_01: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -451,7 +456,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) ; ; VI-LABEL: reduce_load_vector_v8f16_extract_01: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -468,7 +473,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) ; ; GFX11-LABEL: reduce_load_vector_v8f16_extract_01: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -495,7 +500,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) %ptr) #0 { ; SI-LABEL: reduce_load_vector_v8f16_extract_23: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; 
SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s0, s[0:1], 0x1 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -512,7 +517,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) ; ; VI-LABEL: reduce_load_vector_v8f16_extract_23: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -529,7 +534,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) ; ; GFX11-LABEL: reduce_load_vector_v8f16_extract_23: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -556,8 +561,8 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 { ; SI-LABEL: v_extractelement_v8f16_dynamic_sgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 @@ -608,8 +613,8 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) ; ; VI-LABEL: v_extractelement_v8f16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 4, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -651,43 +656,46 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) ; ; GFX11-LABEL: v_extractelement_v8f16_dynamic_sgpr: ; 
GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[1:4], v1, s[6:7] +; GFX11-NEXT: global_load_b128 v[0:3], v0, s[6:7] ; GFX11-NEXT: s_cmp_eq_u32 s0, 1 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX11-NEXT: s_cmp_eq_u32 s0, 3 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 4 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 5 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 
v1, v1, v2, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX11-NEXT: s_cmp_eq_u32 s0, 7 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: global_store_b16 v2, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -704,8 +712,8 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 { ; SI-LABEL: v_extractelement_v16f16_dynamic_sgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 5, v0 @@ -794,8 +802,8 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) ; ; VI-LABEL: v_extractelement_v16f16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -869,78 +877,81 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) ; ; GFX11-LABEL: 
v_extractelement_v16f16_dynamic_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[1:4], v5, s[6:7] -; GFX11-NEXT: global_load_b128 v[5:8], v5, s[6:7] offset:16 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] +; GFX11-NEXT: global_load_b128 v[4:7], v4, s[6:7] offset:16 ; GFX11-NEXT: s_cmp_eq_u32 s0, 1 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 2 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; GFX11-NEXT: s_cmp_eq_u32 s0, 3 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 4 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 5 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; 
GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX11-NEXT: s_cmp_eq_u32 s0, 7 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; GFX11-NEXT: s_cmp_eq_u32 s0, 9 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; GFX11-NEXT: s_cmp_eq_u32 s0, 11 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; 
GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX11-NEXT: s_cmp_eq_u32 s0, 13 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 14 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; GFX11-NEXT: s_cmp_eq_u32 s0, 15 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: global_store_b16 v2, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll index b69852da24744..d670d69947361 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -81,7 +81,7 @@ define amdgpu_kernel void @extract_vector_elt_v3i16(ptr addrspace(1) %out, <3 x ; SI: buffer_store_short ; SI: buffer_store_short -; GFX89-DAG: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], s[0:1], 0x24 +; GFX89-DAG: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], s[2:3], 0x24 ; GFX89-DAG: v_mov_b32_e32 [[VLOAD0:v[0-9]+]], s[[#LOAD + 2]] ; GFX89-DAG: buffer_store_short [[VLOAD0]], off ; GFX89-DAG: v_mov_b32_e32 [[VLOAD1:v[0-9]+]], s[[#LOAD + 3]] @@ -100,9 +100,9 @@ define amdgpu_kernel void @extract_vector_elt_v4i16(ptr addrspace(1) 
%out, <4 x ; SI: s_load_dwordx2 s ; SI: s_load_dwordx2 s -; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[0:1], 0x24 -; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[0:1], 0x4c -; GFX89-DAG: s_load_dword s{{[0-9]+}}, s[0:1], 0x54 +; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[2:3], 0x24 +; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[2:3], 0x4c +; GFX89-DAG: s_load_dword s{{[0-9]+}}, s[2:3], 0x54 ; GCN-NOT: {{buffer|flat|global}} diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll index 331fe26160d41..164352ef75b3b 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll @@ -133,8 +133,8 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x ; isTypeDesirableForOp in SimplifyDemandedBits ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v2i8: -; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x4c -; VI-NEXT: s_load_dword [[LOAD:s[0-9]+]], s[4:5], 0x28 +; VI: s_load_dword [[IDX:s[0-9]+]], s[6:7], 0x4c +; VI-NEXT: s_load_dword [[LOAD:s[0-9]+]], s[6:7], 0x28 ; VI-NOT: {{flat|buffer|global}} ; VI-DAG: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]] ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 @@ -147,8 +147,8 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out } ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i8: -; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x4c -; VI-NEXT: s_load_dword [[LOAD:s[0-9]+]], s[4:5], 0x28 +; VI: s_load_dword [[IDX:s[0-9]+]], s[6:7], 0x4c +; VI-NEXT: s_load_dword [[LOAD:s[0-9]+]], s[6:7], 0x28 ; VI-NOT: {{flat|buffer|global}} ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 ; VI: s_lshr_b32 [[ELT:s[0-9]+]], [[LOAD]], [[SCALED_IDX]] @@ -162,7 +162,7 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out } ; GCN-LABEL: 
{{^}}dynamic_extract_vector_elt_v4i8: -; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x30 +; VI: s_load_dword [[IDX:s[0-9]+]], s[6:7], 0x30 ; VI: s_load_dword [[VEC4:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 @@ -179,7 +179,7 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out } ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v8i8: -; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x10 +; VI: s_load_dword [[IDX:s[0-9]+]], s[6:7], 0x10 ; VI: s_load_dwordx2 [[VEC8:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 diff --git a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll index d5464ce6aa8a3..06da7eea0b47d 100644 --- a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll +++ b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll @@ -8,8 +8,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @bitcast_int_to_vector_extract_0(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %b) { ; GCN-LABEL: bitcast_int_to_vector_extract_0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s12, s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s12, s[2:3], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -38,8 +38,8 @@ define amdgpu_kernel void @bitcast_int_to_vector_extract_0(ptr addrspace(1) %out define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(ptr addrspace(1) %out, ptr addrspace(1) %in, double %b) { ; GCN-LABEL: bitcast_fp_to_vector_extract_0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: v_lshlrev_b32_e32 
v0, 3, v0 @@ -68,8 +68,8 @@ define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(ptr addrspace(1) %out, define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %b) { ; GCN-LABEL: bitcast_int_to_fpvector_extract_0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s12, s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s12, s[2:3], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -98,7 +98,7 @@ define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(ptr addrspace(1) %o define amdgpu_kernel void @no_extract_volatile_load_extract0(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: no_extract_volatile_load_extract0: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -122,7 +122,7 @@ entry: define amdgpu_kernel void @no_extract_volatile_load_extract2(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: no_extract_volatile_load_extract2: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -146,8 +146,8 @@ entry: define amdgpu_kernel void @no_extract_volatile_load_dynextract(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) { ; GCN-LABEL: no_extract_volatile_load_dynextract: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s12, s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s12, s[2:3], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s10, s2 diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll 
b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index f34824cd6cefe..21799ab79b839 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -11,8 +11,8 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; CI-LABEL: s_fabs_free_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -23,8 +23,8 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; ; VI-LABEL: s_fabs_free_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -35,8 +35,8 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; ; GFX9-LABEL: s_fabs_free_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff @@ -47,10 +47,10 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; GFX11-LABEL: s_fabs_free_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 
v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -66,8 +66,8 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; CI-LABEL: s_fabs_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -78,8 +78,8 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; ; VI-LABEL: s_fabs_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -90,8 +90,8 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; ; GFX9-LABEL: s_fabs_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff @@ -102,10 +102,10 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; GFX11-LABEL: s_fabs_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -120,8 +120,8 
@@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; CI-LABEL: s_fabs_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -132,8 +132,8 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; ; VI-LABEL: s_fabs_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -144,8 +144,8 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; ; GFX9-LABEL: s_fabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -156,10 +156,10 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GFX11-LABEL: s_fabs_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff +; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -174,7 +174,7 @@ define 
amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; CI-LABEL: s_fabs_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -187,7 +187,7 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; ; VI-LABEL: s_fabs_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s3, s3, 0x7fff7fff ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -200,7 +200,7 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; ; GFX9-LABEL: s_fabs_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s3, s3, 0x7fff7fff @@ -212,7 +212,7 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; ; GFX11-LABEL: s_fabs_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; GFX11-NEXT: s_and_b32 s3, s3, 0x7fff7fff @@ -231,12 +231,12 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half %in1) { ; CI-LABEL: fabs_fold_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[4:5], 0x2 +; CI-NEXT: s_load_dword s0, s[6:7], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| ; CI-NEXT: s_lshr_b32 s0, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 
s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mul_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -247,8 +247,8 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; ; VI-LABEL: fabs_fold_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -260,8 +260,8 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; ; GFX9-LABEL: fabs_fold_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s3, s2, 16 @@ -273,13 +273,13 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; GFX11-LABEL: fabs_fold_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mul_f16_e64 v1, |s2|, s3 +; GFX11-NEXT: v_mul_f16_e64 v1, |s4|, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -293,7 +293,7 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fabs_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 
+; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -307,7 +307,7 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_fabs_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -321,7 +321,7 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_fabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -332,7 +332,9 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_fabs_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -354,8 +356,8 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; CI-LABEL: fabs_free_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -366,8 +368,8 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; ; VI-LABEL: fabs_free_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -378,8 +380,8 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; ; GFX9-LABEL: fabs_free_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -390,10 +392,10 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; GFX11-LABEL: fabs_free_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff +; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -411,7 +413,7 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fabs_fold_self_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -437,7 +439,7 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: v_fabs_fold_self_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], 
s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -455,7 +457,7 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_fabs_fold_self_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -468,14 +470,15 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: v_fabs_fold_self_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_mul_f16 v0, v1, v0 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -493,8 +496,8 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %other.val) #0 { ; CI-LABEL: v_fabs_fold_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -521,8 +524,8 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_fabs_fold_v2f16: ; VI: ; %bb.0: -; VI-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -542,28 +545,30 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_fabs_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX9-NEXT: v_pk_mul_f16 v0, v0, s6 +; GFX9-NEXT: v_pk_mul_f16 v0, v0, s4 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_fabs_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_mul_f16 v0, v0, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 @@ -582,7 +587,7 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) 
%in) #0 { ; CI-LABEL: v_extract_fabs_fold_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -605,7 +610,7 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; VI-LABEL: v_extract_fabs_fold_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -624,7 +629,7 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; GFX9-LABEL: v_extract_fabs_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -640,7 +645,9 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; GFX11-LABEL: v_extract_fabs_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] @@ -673,7 +680,7 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 { ; CI-LABEL: v_extract_fabs_no_fold_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -691,7 +698,7 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; ; 
VI-LABEL: v_extract_fabs_no_fold_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -709,7 +716,7 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; ; GFX9-LABEL: v_extract_fabs_no_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] @@ -723,7 +730,9 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; ; GFX11-LABEL: v_extract_fabs_no_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll index 07581ade57ccd..60e19dcd48f1e 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -39,25 +39,25 @@ define amdgpu_kernel void @s_fabsf_fn_free(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_fabsf_free(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_fabsf_free: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset0_b32 s0, 31 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_bitset0_b32 s4, 31 
+; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fabsf_free: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_bitset0_b32 s0, 31 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_bitset0_b32 s2, 31 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %bc= bitcast i32 %in to float @@ -69,25 +69,25 @@ define amdgpu_kernel void @s_fabsf_free(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_fabsf_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: s_fabsf_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset0_b32 s0, 31 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_bitset0_b32 s4, 31 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fabsf_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_bitset0_b32 s0, 31 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_bitset0_b32 s2, 31 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; 
VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %fabs = call float @llvm.fabs.f32(float %in) @@ -98,7 +98,7 @@ define amdgpu_kernel void @s_fabsf_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-LABEL: fabs_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -113,7 +113,7 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; VI-LABEL: fabs_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset0_b32 s3, 31 ; VI-NEXT: s_bitset0_b32 s2, 31 @@ -131,26 +131,26 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { define amdgpu_kernel void @fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-LABEL: fabsf_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset0_b32 s3, 31 -; SI-NEXT: s_bitset0_b32 s2, 31 -; SI-NEXT: s_bitset0_b32 s1, 31 -; SI-NEXT: s_bitset0_b32 s0, 31 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_mov_b32_e32 v3, s3 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_bitset0_b32 s7, 31 +; SI-NEXT: s_bitset0_b32 s6, 31 +; SI-NEXT: s_bitset0_b32 s5, 31 +; SI-NEXT: s_bitset0_b32 s4, 31 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: 
v_mov_b32_e32 v3, s7 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fabsf_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_bitset0_b32 s3, 31 @@ -202,7 +202,7 @@ define amdgpu_kernel void @fabsf_fn_fold(ptr addrspace(1) %out, float %in0, floa define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %in1) { ; SI-LABEL: fabs_fold: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -215,7 +215,7 @@ define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %i ; ; VI-LABEL: fabs_fold: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_mul_f32_e64 v2, |s2|, v0 @@ -232,23 +232,23 @@ define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %i define amdgpu_kernel void @bitpreserve_fabsf_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: bitpreserve_fabsf_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, |s0|, 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_add_f32_e64 v0, |s4|, 1.0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: bitpreserve_fabsf_f32: ; VI: ; %bb.0: 
-; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_add_f32_e64 v2, |s0|, 1.0 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_add_f32_e64 v2, |s2|, 1.0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %in.bc = bitcast float %in to i32 diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll index cdc6b5a48d0a6..7352fcdd071d5 100644 --- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll @@ -9,8 +9,8 @@ define amdgpu_kernel void @fadd_f16( ; SI-LABEL: fadd_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s2, s10 @@ -35,8 +35,8 @@ define amdgpu_kernel void @fadd_f16( ; ; VI-LABEL: fadd_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s2, s10 @@ -59,8 +59,8 @@ define amdgpu_kernel void @fadd_f16( ; GFX11-SDAG-LABEL: fadd_f16: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-SDAG-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s3, s11 @@ -87,8 +87,8 @@ 
define amdgpu_kernel void @fadd_f16( ; GFX11-GISEL-LABEL: fadd_f16: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-GISEL-NEXT: s_mov_b32 s10, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -111,8 +111,8 @@ define amdgpu_kernel void @fadd_f16( ; GFX11-FAKE16-SDAG-LABEL: fadd_f16: ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry ; GFX11-FAKE16-SDAG-NEXT: s_clause 0x1 -; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FAKE16-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FAKE16-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s10, -1 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, s11 @@ -137,8 +137,8 @@ define amdgpu_kernel void @fadd_f16( ; GFX11-FAKE16-GISEL-LABEL: fadd_f16: ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry ; GFX11-FAKE16-GISEL-NEXT: s_clause 0x1 -; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s10, -1 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -196,7 +196,7 @@ entry: define amdgpu_kernel void @fadd_f16_imm_a( ; SI-LABEL: fadd_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -216,7 +216,7 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; ; VI-LABEL: 
fadd_f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -234,7 +234,7 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; ; GFX11-SDAG-LABEL: fadd_f16_imm_a: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -256,7 +256,7 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; ; GFX11-GISEL-LABEL: fadd_f16_imm_a: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_mov_b32 s6, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -274,7 +274,7 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; ; GFX11-FAKE16-SDAG-LABEL: fadd_f16_imm_a: ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry -; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -294,7 +294,7 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; ; GFX11-FAKE16-GISEL-LABEL: fadd_f16_imm_a: ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry -; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -340,7 +340,7 @@ entry: define amdgpu_kernel void @fadd_f16_imm_b( ; SI-LABEL: fadd_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; 
SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -360,7 +360,7 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; ; VI-LABEL: fadd_f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -378,7 +378,7 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; ; GFX11-SDAG-LABEL: fadd_f16_imm_b: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -400,7 +400,7 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; ; GFX11-GISEL-LABEL: fadd_f16_imm_b: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_mov_b32 s6, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -418,7 +418,7 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; ; GFX11-FAKE16-SDAG-LABEL: fadd_f16_imm_b: ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry -; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -438,7 +438,7 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; ; GFX11-FAKE16-GISEL-LABEL: fadd_f16_imm_b: ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry -; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -484,8 +484,8 @@ entry: define amdgpu_kernel void 
@fadd_v2f16( ; SI-LABEL: fadd_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -518,8 +518,8 @@ define amdgpu_kernel void @fadd_v2f16( ; ; VI-LABEL: fadd_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -544,11 +544,13 @@ define amdgpu_kernel void @fadd_v2f16( ; GFX11-SDAG-LABEL: fadd_v2f16: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[8:9], s[0:1], 0x34 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[8:9], s[2:3], 0x34 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] @@ -565,8 +567,10 @@ define amdgpu_kernel void @fadd_v2f16( ; GFX11-GISEL-LABEL: fadd_v2f16: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_clause 0x1 @@ -584,11 +588,13 @@ define amdgpu_kernel void @fadd_v2f16( ; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16: ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry ; GFX11-FAKE16-SDAG-NEXT: s_clause 0x1 -; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FAKE16-SDAG-NEXT: s_load_b64 s[8:9], s[0:1], 0x34 -; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FAKE16-SDAG-NEXT: s_load_b64 s[8:9], s[2:3], 0x34 +; GFX11-FAKE16-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX11-FAKE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-SDAG-NEXT: s_clause 0x1 ; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] @@ -605,8 +611,10 @@ define amdgpu_kernel void @fadd_v2f16( ; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16: ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry ; GFX11-FAKE16-GISEL-NEXT: s_clause 0x1 -; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FAKE16-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-GISEL-NEXT: s_clause 0x1 @@ -657,7 +665,7 @@ entry: define amdgpu_kernel void @fadd_v2f16_imm_a( ; SI-LABEL: fadd_v2f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -684,7 +692,7 @@ define 
amdgpu_kernel void @fadd_v2f16_imm_a( ; ; VI-LABEL: fadd_v2f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -705,10 +713,12 @@ define amdgpu_kernel void @fadd_v2f16_imm_a( ; ; GFX11-SDAG-LABEL: fadd_v2f16_imm_a: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-SDAG-NEXT: s_mov_b32 s4, s0 @@ -722,7 +732,9 @@ define amdgpu_kernel void @fadd_v2f16_imm_a( ; ; GFX11-GISEL-LABEL: fadd_v2f16_imm_a: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[2:3] @@ -737,10 +749,12 @@ define amdgpu_kernel void @fadd_v2f16_imm_a( ; ; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16_imm_a: ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry -; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-SDAG-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s0 @@ -754,7 +768,9 @@ define amdgpu_kernel void @fadd_v2f16_imm_a( ; ; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16_imm_a: ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry -; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[2:3] @@ -796,7 +812,7 @@ entry: define amdgpu_kernel void @fadd_v2f16_imm_b( ; SI-LABEL: fadd_v2f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -823,7 +839,7 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( ; ; VI-LABEL: fadd_v2f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -844,10 +860,12 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( ; ; GFX11-SDAG-LABEL: fadd_v2f16_imm_b: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-SDAG-NEXT: s_mov_b32 s4, s0 
@@ -861,7 +879,9 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( ; ; GFX11-GISEL-LABEL: fadd_v2f16_imm_b: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[2:3] @@ -876,10 +896,12 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( ; ; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16_imm_b: ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry -; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s0 @@ -893,7 +915,9 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( ; ; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16_imm_b: ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry -; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll index 4bfaa6e90bdfe..7252c69cb1cf7 100644 --- 
a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -73,7 +73,7 @@ define i32 @global_load_2xi16_align2(ptr addrspace(1) %p) #0 { define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 { ; GFX7-ALIGNED-LABEL: global_store_2xi16_align2: ; GFX7-ALIGNED: ; %bb.0: -; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -89,7 +89,7 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align2: ; GFX7-UNALIGNED: ; %bb.0: -; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -99,7 +99,7 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; ; GFX9-LABEL: global_store_2xi16_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x20001 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -108,7 +108,7 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; ; GFX10-LABEL: global_store_2xi16_align2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x20001 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -117,7 +117,7 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; ; GFX11-LABEL: global_store_2xi16_align2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 
0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -127,7 +127,7 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; ; GFX12-LABEL: global_store_2xi16_align2: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -219,7 +219,7 @@ define i32 @global_load_2xi16_align1(ptr addrspace(1) %p) #0 { define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 { ; GFX7-ALIGNED-LABEL: global_store_2xi16_align1: ; GFX7-ALIGNED: ; %bb.0: -; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) @@ -246,7 +246,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align1: ; GFX7-UNALIGNED: ; %bb.0: -; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -256,7 +256,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; ; GFX9-LABEL: global_store_2xi16_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x20001 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -265,7 +265,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; ; 
GFX10-LABEL: global_store_2xi16_align1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x20001 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -274,7 +274,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; ; GFX11-LABEL: global_store_2xi16_align1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -284,7 +284,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; ; GFX12-LABEL: global_store_2xi16_align1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -358,7 +358,7 @@ define i32 @global_load_2xi16_align4(ptr addrspace(1) %p) #0 { define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 { ; GFX7-ALIGNED-LABEL: global_store_2xi16_align4: ; GFX7-ALIGNED: ; %bb.0: -; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -368,7 +368,7 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align4: ; GFX7-UNALIGNED: ; %bb.0: -; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, 
s0 @@ -378,7 +378,7 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; ; GFX9-LABEL: global_store_2xi16_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x20001 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -387,7 +387,7 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; ; GFX10-LABEL: global_store_2xi16_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x20001 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -396,7 +396,7 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; ; GFX11-LABEL: global_store_2xi16_align4: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -406,7 +406,7 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; ; GFX12-LABEL: global_store_2xi16_align4: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 581b7b4cff9ed..7af972b96ec68 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -21,7 +21,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_undef_value_f16: 
; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -31,7 +31,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_undef_value_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v0, v0, s[0:1] @@ -39,7 +39,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace( ; ; CI-LABEL: test_fold_canonicalize_undef_value_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0 @@ -49,7 +49,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_undef_value_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v0, s[0:1] @@ -64,7 +64,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace( define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -76,7 +76,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 ; ; GFX9-LABEL: v_test_canonicalize_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v0, v0, s[0:1] @@ -87,7 +87,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 ; ; CI-LABEL: v_test_canonicalize_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -100,7 +100,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 ; ; GFX11-LABEL: v_test_canonicalize_var_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v0, v0, s[0:1] @@ -119,10 +119,10 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i16 zeroext %val.arg) #1 { ; VI-LABEL: s_test_canonicalize_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v2, s2, s2 +; VI-NEXT: v_max_f16_e64 v2, s4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -130,34 +130,35 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1 ; ; GFX9-LABEL: s_test_canonicalize_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_max_f16_e64 v1, s4, s4 -; GFX9-NEXT: global_store_short v0, 
v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; CI-LABEL: s_test_canonicalize_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[0:1], 0xb -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_load_dword s0, s[2:3], 0xb ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_canonicalize_var_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e64 v1, s2, s2 +; GFX11-NEXT: v_max_f16_e64 v1, s4, s4 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -168,35 +169,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1 ret void } -define half @s_test_canonicalize_arg(half %x) #1 { -; VI-LABEL: s_test_canonicalize_arg: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: s_test_canonicalize_arg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; CI-LABEL: s_test_canonicalize_arg: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: s_test_canonicalize_arg: -; GFX11: ; 
%bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] - %canonicalized = call half @llvm.canonicalize.f16(half %x) - ret half %canonicalized -} - define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 { ; VI-LABEL: v_test_canonicalize_build_vector_v2f16: ; VI: ; %bb.0: @@ -239,7 +211,7 @@ define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_fabs_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -251,7 +223,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fabs_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -262,7 +234,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou ; ; CI-LABEL: v_test_canonicalize_fabs_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -275,7 +247,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fabs_var_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -295,7 +267,7 @@ define amdgpu_kernel void 
@v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_fneg_fabs_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -307,7 +279,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1 ; ; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -318,7 +290,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1 ; ; CI-LABEL: v_test_canonicalize_fneg_fabs_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -331,7 +303,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1 ; ; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -352,7 +324,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1 define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_fneg_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -364,7 +336,7 @@ define 
amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fneg_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -375,7 +347,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou ; ; CI-LABEL: v_test_canonicalize_fneg_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -388,7 +360,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fneg_var_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -408,7 +380,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr addrspace(1) %out) #2 { ; VI-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -420,7 +392,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add ; ; GFX9-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -431,7 +403,7 @@ define amdgpu_kernel void 
@v_test_no_denormals_canonicalize_fneg_var_f16(ptr add ; ; CI-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -444,7 +416,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add ; ; GFX11-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -464,7 +436,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(ptr addrspace(1) %out) #2 { ; VI-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -476,7 +448,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt ; ; GFX9-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -487,7 +459,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt ; ; CI-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -500,7 +472,7 @@ define amdgpu_kernel void 
@v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt ; ; GFX11-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -521,7 +493,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_p0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -531,7 +503,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p0_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v0, v0, s[0:1] @@ -539,7 +511,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) ; ; CI-LABEL: test_fold_canonicalize_p0_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0 @@ -549,7 +521,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v0, s[0:1] @@ -564,7 +536,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) define 
amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_n0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -574,7 +546,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n0_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -583,7 +555,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) ; ; CI-LABEL: test_fold_canonicalize_n0_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x8000 @@ -593,7 +565,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -608,7 +580,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_p1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x3c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -618,7 +590,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr 
addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -627,7 +599,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) ; ; CI-LABEL: test_fold_canonicalize_p1_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x3c00 @@ -637,7 +609,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -652,7 +624,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_n1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0xffffbc00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -662,7 +634,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffbc00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -671,7 +643,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) ; ; CI-LABEL: 
test_fold_canonicalize_n1_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0xbc00 @@ -681,7 +653,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbc00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -696,7 +668,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_literal_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -706,7 +678,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) % ; ; GFX9-LABEL: test_fold_canonicalize_literal_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4c00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -715,7 +687,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) % ; ; CI-LABEL: test_fold_canonicalize_literal_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x4c00 @@ -725,7 +697,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) % ; ; GFX11-LABEL: test_fold_canonicalize_literal_f16: ; GFX11: ; %bb.0: -; 
GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -740,7 +712,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) % define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x3ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -750,7 +722,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1 ; ; GFX9-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -759,7 +731,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1 ; ; CI-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x3ff @@ -769,7 +741,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1 ; ; GFX11-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -784,7 +756,7 @@ define amdgpu_kernel void 
@test_default_denormals_fold_canonicalize_denormal0_f1 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr addrspace(1) %out) #3 { ; VI-LABEL: test_denormals_fold_canonicalize_denormal0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x3ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -794,7 +766,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -803,7 +775,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr ad ; ; CI-LABEL: test_denormals_fold_canonicalize_denormal0_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x3ff @@ -813,7 +785,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -828,7 +800,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr ad define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; 
VI-NEXT: v_mov_b32_e32 v2, 0xffff83ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -838,7 +810,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1 ; ; GFX9-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff83ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -847,7 +819,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1 ; ; CI-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x83ff @@ -857,7 +829,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1 ; ; GFX11-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -872,7 +844,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr addrspace(1) %out) #3 { ; VI-LABEL: test_denormals_fold_canonicalize_denormal1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0xffff83ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -882,7 +854,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff83ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -891,7 +863,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr ad ; ; CI-LABEL: test_denormals_fold_canonicalize_denormal1_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x83ff @@ -901,7 +873,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -916,7 +888,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr ad define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_qnan_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -926,7 +898,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_qnan_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -935,7 +907,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out ; ; CI-LABEL: test_fold_canonicalize_qnan_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7c00 @@ -945,7 +917,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_qnan_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -960,7 +932,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_qnan_value_neg1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -970,7 +942,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -979,7 +951,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrsp ; ; CI-LABEL: test_fold_canonicalize_qnan_value_neg1_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 @@ -989,7 +961,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -1004,7 +976,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_qnan_value_neg2_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1014,7 +986,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1023,7 +995,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrsp ; ; CI-LABEL: test_fold_canonicalize_qnan_value_neg2_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 @@ -1033,7 +1005,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -1048,7 +1020,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrsp define amdgpu_kernel void 
@test_fold_canonicalize_snan0_value_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan0_value_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1058,7 +1030,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan0_value_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1067,7 +1039,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace( ; ; CI-LABEL: test_fold_canonicalize_snan0_value_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 @@ -1077,7 +1049,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan0_value_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -1092,7 +1064,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan1_value_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1102,7 +1074,7 @@ define amdgpu_kernel void 
@test_fold_canonicalize_snan1_value_f16(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan1_value_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1111,7 +1083,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace( ; ; CI-LABEL: test_fold_canonicalize_snan1_value_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 @@ -1121,7 +1093,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan1_value_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -1136,7 +1108,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan2_value_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1146,7 +1118,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan2_value_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1155,7 +1127,7 @@ define amdgpu_kernel 
void @test_fold_canonicalize_snan2_value_f16(ptr addrspace( ; ; CI-LABEL: test_fold_canonicalize_snan2_value_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 @@ -1165,7 +1137,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan2_value_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -1180,7 +1152,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan3_value_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1190,7 +1162,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan3_value_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1199,7 +1171,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace( ; ; CI-LABEL: test_fold_canonicalize_snan3_value_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 @@ -1209,7 +1181,7 @@ define amdgpu_kernel void 
@test_fold_canonicalize_snan3_value_f16(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan3_value_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -1224,7 +1196,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace( define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_var_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1242,7 +1214,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) ; ; GFX9-LABEL: v_test_canonicalize_var_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1254,7 +1226,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) ; ; CI-LABEL: v_test_canonicalize_var_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: s_mov_b32 s7, s3 @@ -1277,8 +1249,10 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) ; ; GFX11-LABEL: v_test_canonicalize_var_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1298,7 +1272,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_fabs_var_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1316,7 +1290,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_canonicalize_fabs_var_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1329,7 +1303,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) % ; ; CI-LABEL: v_test_canonicalize_fabs_var_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: s_mov_b32 s7, s3 @@ -1352,13 +1326,14 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_canonicalize_fabs_var_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: 
v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1376,7 +1351,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) % define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1394,7 +1369,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace ; ; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1407,7 +1382,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace ; ; CI-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: s_mov_b32 s7, s3 @@ -1431,13 +1406,14 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace ; ; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 
v0, v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1456,7 +1432,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_fneg_var_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1474,7 +1450,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_canonicalize_fneg_var_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1486,7 +1462,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) % ; ; CI-LABEL: v_test_canonicalize_fneg_var_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: s_mov_b32 s7, s3 @@ -1510,8 +1486,10 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_canonicalize_fneg_var_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1532,12 +1510,12 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) % define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out, i32 zeroext %val.arg) #1 { ; VI-LABEL: s_test_canonicalize_var_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s3, s2, 16 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_max_f16_e64 v0, s2, s2 +; VI-NEXT: s_lshr_b32 s2, s4, 16 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_max_f16_e64 v0, s4, s4 ; VI-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1547,39 +1525,40 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out, ; ; GFX9-LABEL: s_test_canonicalize_var_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; CI-LABEL: s_test_canonicalize_var_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[0:1], 0xb -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dword s0, s[2:3], 0xb ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s3, s2, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; CI-NEXT: s_lshr_b32 s1, s0, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 
; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_or_b32_e32 v0, v1, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_canonicalize_var_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v1, s2, s2 +; GFX11-NEXT: v_pk_max_f16 v1, s4, s4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1593,7 +1572,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out, define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_p0_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1603,7 +1582,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_p0_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] @@ -1611,7 +1590,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out ; ; CI-LABEL: test_fold_canonicalize_p0_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; 
CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0 @@ -1621,7 +1600,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_p0_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -1636,7 +1615,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_n0_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x80008000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1646,7 +1625,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_n0_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x80008000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1655,7 +1634,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out ; ; CI-LABEL: test_fold_canonicalize_n0_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x80008000 @@ -1665,7 +1644,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_n0_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x80008000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1680,7 +1659,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_p1_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x3c003c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1690,7 +1669,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_p1_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c003c00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1699,7 +1678,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out ; ; CI-LABEL: test_fold_canonicalize_p1_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x3c003c00 @@ -1709,7 +1688,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_p1_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c003c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1724,7 +1703,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_n1_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; 
VI-NEXT: v_mov_b32_e32 v2, 0xbc00bc00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1734,7 +1713,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_n1_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xbc00bc00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1743,7 +1722,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out ; ; CI-LABEL: test_fold_canonicalize_n1_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0xbc00bc00 @@ -1753,7 +1732,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_n1_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbc00bc00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1768,7 +1747,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_literal_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4c004c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1778,7 +1757,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1) ; ; GFX9-LABEL: test_fold_canonicalize_literal_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4c004c00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1787,7 +1766,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1) ; ; CI-LABEL: test_fold_canonicalize_literal_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x4c004c00 @@ -1797,7 +1776,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1) ; ; GFX11-LABEL: test_fold_canonicalize_literal_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c004c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1812,7 +1791,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1) define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x3ff03ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1822,7 +1801,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(p ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff03ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1831,7 +1810,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(p ; ; CI-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; 
CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x3ff03ff @@ -1841,7 +1820,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(p ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1856,7 +1835,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(p define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr addrspace(1) %out) #3 { ; VI-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x3ff03ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1866,7 +1845,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff03ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1875,7 +1854,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr ; ; CI-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x3ff03ff @@ -1885,7 +1864,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16: ; 
GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1900,7 +1879,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x83ff83ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1910,7 +1889,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(p ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x83ff83ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1919,7 +1898,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(p ; ; CI-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x83ff83ff @@ -1929,7 +1908,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(p ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1944,7 +1923,7 @@ define 
amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(p define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr addrspace(1) %out) #3 { ; VI-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x83ff83ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1954,7 +1933,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x83ff83ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1963,7 +1942,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr ; ; CI-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x83ff83ff @@ -1973,7 +1952,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1988,7 +1967,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_qnan_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7c007c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1998,7 +1977,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %o ; ; GFX9-LABEL: test_fold_canonicalize_qnan_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c007c00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2007,7 +1986,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %o ; ; CI-LABEL: test_fold_canonicalize_qnan_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7c007c00 @@ -2017,7 +1996,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %o ; ; GFX11-LABEL: test_fold_canonicalize_qnan_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c007c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2032,7 +2011,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %o define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2042,7 +2021,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addr ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2051,7 +2030,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addr ; ; CI-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 @@ -2061,7 +2040,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addr ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2076,7 +2055,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addr define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2086,7 +2065,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addr ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2095,7 +2074,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addr ; ; CI-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16: ; CI: ; %bb.0: -; 
CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 @@ -2105,7 +2084,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addr ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2120,7 +2099,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addr define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan0_value_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2130,7 +2109,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspac ; ; GFX9-LABEL: test_fold_canonicalize_snan0_value_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2139,7 +2118,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspac ; ; CI-LABEL: test_fold_canonicalize_snan0_value_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 @@ -2149,7 +2128,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspac ; ; GFX11-LABEL: 
test_fold_canonicalize_snan0_value_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2164,7 +2143,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspac define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan1_value_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2174,7 +2153,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspac ; ; GFX9-LABEL: test_fold_canonicalize_snan1_value_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2183,7 +2162,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspac ; ; CI-LABEL: test_fold_canonicalize_snan1_value_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 @@ -2193,7 +2172,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspac ; ; GFX11-LABEL: test_fold_canonicalize_snan1_value_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2208,7 +2187,7 @@ define amdgpu_kernel void 
@test_fold_canonicalize_snan1_value_v2f16(ptr addrspac define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan2_value_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2218,7 +2197,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspac ; ; GFX9-LABEL: test_fold_canonicalize_snan2_value_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2227,7 +2206,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspac ; ; CI-LABEL: test_fold_canonicalize_snan2_value_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 @@ -2237,7 +2216,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspac ; ; GFX11-LABEL: test_fold_canonicalize_snan2_value_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2252,7 +2231,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspac define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan3_value_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 
0x7e007e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2262,7 +2241,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspac ; ; GFX9-LABEL: test_fold_canonicalize_snan3_value_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2271,7 +2250,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspac ; ; CI-LABEL: test_fold_canonicalize_snan3_value_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 @@ -2281,7 +2260,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspac ; ; GFX11-LABEL: test_fold_canonicalize_snan3_value_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2376,7 +2355,7 @@ define <4 x half> @v_test_canonicalize_var_v4f16(<4 x half> %val) #1 { define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: s_test_canonicalize_undef_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2386,7 +2365,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out ; ; GFX9-LABEL: s_test_canonicalize_undef_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] @@ -2394,7 +2373,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out ; ; CI-LABEL: s_test_canonicalize_undef_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0 @@ -2404,7 +2383,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out ; ; GFX11-LABEL: s_test_canonicalize_undef_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -2678,7 +2657,7 @@ define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 { define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: s_test_canonicalize_undef_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2689,7 +2668,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out ; ; GFX9-LABEL: s_test_canonicalize_undef_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2698,7 +2677,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out ; ; CI-LABEL: s_test_canonicalize_undef_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 @@ -2709,7 +2688,7 @@ define amdgpu_kernel void 
@s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out ; ; GFX11-LABEL: s_test_canonicalize_undef_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll index d53c0411ad88c..f0ce96af90649 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -23,7 +23,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_var_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -35,7 +35,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 ; ; GFX9-LABEL: v_test_canonicalize_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -46,7 +46,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 ; ; GFX11-LABEL: v_test_canonicalize_var_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -59,7 +59,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 ; ; GFX12-LABEL: v_test_canonicalize_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -78,8 +78,8 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, float %val) #1 { ; GFX6-LABEL: s_test_canonicalize_var_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s2, s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s2, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -89,8 +89,8 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; ; GFX8-LABEL: s_test_canonicalize_var_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -100,8 +100,8 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; ; GFX9-LABEL: s_test_canonicalize_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, s2, s2 @@ -111,11 +111,11 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; GFX11-LABEL: s_test_canonicalize_var_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: 
v_max_f32_e64 v1, s2, s2 +; GFX11-NEXT: v_max_f32_e64 v1, s4, s4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -123,7 +123,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; ; GFX12-LABEL: s_test_canonicalize_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, s2, s2 @@ -139,7 +139,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -151,7 +151,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -162,7 +162,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -175,7 +175,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou ; ; GFX12-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; 
GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -195,7 +195,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -207,7 +207,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 ; ; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -218,7 +218,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 ; ; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -231,7 +231,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 ; ; GFX12-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -252,7 +252,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: 
v_test_canonicalize_fneg_var_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -264,7 +264,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fneg_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -275,7 +275,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fneg_var_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -288,7 +288,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou ; ; GFX12-LABEL: v_test_canonicalize_fneg_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -308,7 +308,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_undef_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -318,7 +318,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou ; ; GFX9-LABEL: 
test_fold_canonicalize_undef_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] @@ -326,7 +326,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou ; ; GFX11-LABEL: test_fold_canonicalize_undef_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -336,7 +336,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou ; ; GFX12-LABEL: test_fold_canonicalize_undef_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v0, s[0:1] @@ -351,7 +351,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_p0_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -361,7 +361,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] @@ -369,7 +369,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p0_f32: ; 
GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -379,7 +379,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_p0_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v0, s[0:1] @@ -394,7 +394,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_n0_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -404,7 +404,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -413,7 +413,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -424,7 +424,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_n0_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; 
GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -440,7 +440,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_p1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 1.0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -450,7 +450,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -459,7 +459,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -469,7 +469,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_p1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -484,7 +484,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_n1_f32: ; 
GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, -1.0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -494,7 +494,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, -1.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -503,7 +503,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -513,7 +513,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_n1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -528,7 +528,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_literal_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x41800000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -538,7 +538,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % ; ; GFX9-LABEL: 
test_fold_canonicalize_literal_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x41800000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -547,7 +547,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % ; ; GFX11-LABEL: test_fold_canonicalize_literal_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -557,7 +557,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % ; ; GFX12-LABEL: test_fold_canonicalize_literal_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -572,7 +572,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -582,7 +582,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] @@ -590,7 +590,7 @@ 
define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -600,7 +600,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v0, s[0:1] @@ -615,7 +615,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic(ptr addrspace(1) %out) #5 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff ; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -626,7 +626,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s2, 0x7fffff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f32_e64 v1, s2, s2 @@ -636,7 +636,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -647,7 +647,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -663,7 +663,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out(ptr addrspace(1) %out) #6 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff ; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -674,7 +674,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s2, 0x7fffff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f32_e64 v1, s2, s2 @@ -684,7 +684,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -695,7 +695,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX12-LABEL: 
test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -711,7 +711,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in(ptr addrspace(1) %out) #7 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff ; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -722,7 +722,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s2, 0x7fffff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f32_e64 v1, s2, s2 @@ -732,7 +732,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -743,7 +743,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_max_num_f32_e64 v1, 
0x7fffff, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -759,7 +759,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr addrspace(1) %out) #3 { ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -769,7 +769,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -778,7 +778,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -788,7 +788,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -803,7 +803,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr addrspace(1) %out) #1 { ; 
GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -813,7 +813,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -822,7 +822,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -833,7 +833,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -849,7 +849,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr addrspace(1) %out) #3 { ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x807fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -859,7 +859,7 @@ define amdgpu_kernel void 
@test_denormals_fold_canonicalize_denormal1_f32(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x807fffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -868,7 +868,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x807fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -878,7 +878,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x807fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -893,7 +893,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -903,7 +903,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_qnan_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 
0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -912,7 +912,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_qnan_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -922,7 +922,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out ; ; GFX12-LABEL: test_fold_canonicalize_qnan_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -937,7 +937,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -947,7 +947,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -956,7 +956,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 
s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -966,7 +966,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp ; ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -981,7 +981,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -991,7 +991,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1000,7 +1000,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1010,7 +1010,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp ; ; 
GFX12-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1025,7 +1025,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1035,7 +1035,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1044,7 +1044,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1054,7 +1054,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: 
global_store_b32 v0, v1, s[0:1] @@ -1069,7 +1069,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1079,7 +1079,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1088,7 +1088,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1098,7 +1098,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1113,7 +1113,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f32: ; 
GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1123,7 +1123,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan2_value_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1132,7 +1132,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan2_value_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1142,7 +1142,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan2_value_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1157,7 +1157,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan3_value_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1167,7 +1167,7 @@ define amdgpu_kernel void 
@test_fold_canonicalize_snan3_value_f32(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan3_value_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1176,7 +1176,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan3_value_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1186,7 +1186,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan3_value_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1201,7 +1201,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_var_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1213,7 +1213,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 ; ; GFX9-LABEL: v_test_canonicalize_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], 
v2, s[0:1] @@ -1224,7 +1224,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 ; ; GFX11-LABEL: v_test_canonicalize_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1237,7 +1237,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 ; ; GFX12-LABEL: v_test_canonicalize_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1256,7 +1256,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, double %val) #1 { ; GFX6-LABEL: s_test_canonicalize_var_f64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_max_f64 v[2:3], s[2:3], s[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1266,7 +1266,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; ; GFX8-LABEL: s_test_canonicalize_var_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s0 @@ -1276,7 +1276,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; ; GFX9-LABEL: s_test_canonicalize_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] @@ -1285,7 
+1285,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; ; GFX11-LABEL: s_test_canonicalize_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] @@ -1296,7 +1296,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; ; GFX12-LABEL: s_test_canonicalize_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e64 v[0:1], s[2:3], s[2:3] @@ -1312,7 +1312,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1324,7 +1324,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -1335,7 +1335,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1348,7 
+1348,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou ; ; GFX12-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1368,7 +1368,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1380,7 +1380,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 ; ; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -1391,7 +1391,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 ; ; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1404,7 +1404,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 ; ; GFX12-LABEL: v_test_canonicalize_fneg_fabs_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], 
v2, s[0:1] @@ -1425,7 +1425,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1437,7 +1437,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -1448,7 +1448,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1461,7 +1461,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou ; ; GFX12-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1481,7 +1481,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_p0_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1492,7 +1492,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1501,7 +1501,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 @@ -1513,7 +1513,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_p0_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v1, v0 @@ -1530,7 +1530,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_n0_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1541,7 +1541,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 
1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1550,7 +1550,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1561,7 +1561,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_n0_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1577,7 +1577,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_p1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1588,7 +1588,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1597,7 +1597,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 
v0, v[0:1], s[0:1] @@ -1607,7 +1607,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_p1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1622,7 +1622,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_n1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1633,7 +1633,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xbff00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1642,7 +1642,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbff00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1652,7 +1652,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_n1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 
v1, 0xbff00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1667,7 +1667,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_literal_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1678,7 +1678,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % ; ; GFX9-LABEL: test_fold_canonicalize_literal_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40300000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1687,7 +1687,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % ; ; GFX11-LABEL: test_fold_canonicalize_literal_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40300000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1697,7 +1697,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % ; ; GFX12-LABEL: test_fold_canonicalize_literal_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40300000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1712,7 +1712,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr addrspace(1) %out) #2 { ; 
GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1723,7 +1723,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1732,7 +1732,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 @@ -1744,7 +1744,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v1, v0 @@ -1761,7 +1761,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr addrspace(1) %out) #3 { ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, -1 ; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1772,7 +1772,7 @@ 
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xfffff @@ -1782,7 +1782,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0xfffff ; GFX11-NEXT: v_mov_b32_e32 v0, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1793,7 +1793,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0xfffff ; GFX12-NEXT: v_mov_b32_e32 v0, -1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1809,7 +1809,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr addrspace(1) %out) #2 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1820,7 +1820,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1829,7 +1829,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1840,7 +1840,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1856,7 +1856,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr addrspace(1) %out) #3 { ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, -1 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1867,7 +1867,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x800fffff @@ -1877,7 +1877,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; 
GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x800fffff ; GFX11-NEXT: v_mov_b32_e32 v0, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1888,7 +1888,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x800fffff ; GFX12-NEXT: v_mov_b32_e32 v0, -1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1904,7 +1904,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1915,7 +1915,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_qnan_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1924,7 +1924,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_qnan_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1934,7 +1934,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out ; ; GFX12-LABEL: 
test_fold_canonicalize_qnan_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1949,7 +1949,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1960,7 +1960,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1969,7 +1969,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1979,7 +1979,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp ; ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: 
global_store_b64 v0, v[0:1], s[0:1] @@ -1994,7 +1994,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -2005,7 +2005,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2014,7 +2014,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2024,7 +2024,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp ; ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2039,7 +2039,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: 
test_fold_canonicalize_snan0_value_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -2050,7 +2050,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan0_value_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2059,7 +2059,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan0_value_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2069,7 +2069,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan0_value_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2084,7 +2084,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ 
-2095,7 +2095,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2104,7 +2104,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2114,7 +2114,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2129,7 +2129,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -2140,7 +2140,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2149,7 +2149,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2159,7 +2159,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2174,7 +2174,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -2185,7 +2185,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2194,7 +2194,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX11: ; %bb.0: -; 
GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2204,7 +2204,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2219,7 +2219,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { ; GFX6-LABEL: test_canonicalize_value_f64_flush: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2236,7 +2236,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; ; GFX8-LABEL: test_canonicalize_value_f64_flush: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2253,7 +2253,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; ; GFX9-LABEL: test_canonicalize_value_f64_flush: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -2264,7 +2264,9 @@ define amdgpu_kernel void 
@test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; ; GFX11-LABEL: test_canonicalize_value_f64_flush: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -2277,7 +2279,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; ; GFX12-LABEL: test_canonicalize_value_f64_flush: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -2299,7 +2303,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { ; GFX6-LABEL: test_canonicalize_value_f32_flush: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2316,7 +2320,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; ; GFX8-LABEL: test_canonicalize_value_f32_flush: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2333,7 +2337,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; ; GFX9-LABEL: test_canonicalize_value_f32_flush: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -2344,7 +2348,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; ; GFX11-LABEL: test_canonicalize_value_f32_flush: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2357,7 +2363,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; ; GFX12-LABEL: test_canonicalize_value_f32_flush: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2379,7 +2387,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { ; GFX6-LABEL: test_canonicalize_value_f16_flush: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2397,7 +2405,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; ; GFX8-LABEL: test_canonicalize_value_f16_flush: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ 
-2414,7 +2422,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; ; GFX9-LABEL: test_canonicalize_value_f16_flush: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -2425,7 +2433,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; ; GFX11-LABEL: test_canonicalize_value_f16_flush: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -2438,7 +2448,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; ; GFX12-LABEL: test_canonicalize_value_f16_flush: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[0:1] @@ -2461,7 +2473,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { ; GFX6-LABEL: test_canonicalize_value_v2f16_flush: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2484,7 +2496,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; ; GFX8-LABEL: test_canonicalize_value_v2f16_flush: ; GFX8: ; 
%bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2504,7 +2516,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; ; GFX9-LABEL: test_canonicalize_value_v2f16_flush: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -2515,7 +2527,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; ; GFX11-LABEL: test_canonicalize_value_v2f16_flush: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2528,7 +2542,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; ; GFX12-LABEL: test_canonicalize_value_v2f16_flush: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2550,7 +2566,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { ; GFX6-LABEL: test_canonicalize_value_f64_denorm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX6-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2567,7 +2583,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; ; GFX8-LABEL: test_canonicalize_value_f64_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2584,7 +2600,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; ; GFX9-LABEL: test_canonicalize_value_f64_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -2595,7 +2611,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; ; GFX11-LABEL: test_canonicalize_value_f64_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -2608,7 +2626,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; ; GFX12-LABEL: test_canonicalize_value_f64_denorm: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -2630,7 +2650,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %arg, ptr addrspace(1) 
%out) #3 { ; GFX6-LABEL: test_canonicalize_value_f32_denorm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2647,7 +2667,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; ; GFX8-LABEL: test_canonicalize_value_f32_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2664,7 +2684,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; ; GFX9-LABEL: test_canonicalize_value_f32_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -2675,7 +2695,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; ; GFX11-LABEL: test_canonicalize_value_f32_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2688,7 +2710,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; ; GFX12-LABEL: test_canonicalize_value_f32_denorm: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, 
v0, s[0:1] @@ -2711,7 +2735,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { ; GFX6-LABEL: test_canonicalize_value_f16_denorm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2729,7 +2753,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; ; GFX8-LABEL: test_canonicalize_value_f16_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2746,7 +2770,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; ; GFX9-LABEL: test_canonicalize_value_f16_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -2757,7 +2781,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; ; GFX11-LABEL: test_canonicalize_value_f16_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -2770,7 +2796,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; ; GFX12-LABEL: test_canonicalize_value_f16_denorm: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: s_load_b128 
s[0:3], s[2:3], 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[0:1] @@ -2794,7 +2822,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { ; GFX6-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2817,7 +2845,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; ; GFX8-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2836,7 +2864,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; ; GFX9-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -2847,7 +2875,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; ; GFX11-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2860,7 +2890,9 @@ 
define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; ; GFX12-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2882,7 +2914,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) #1 { ; GFX6-LABEL: v_test_canonicalize_var_v2f64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2899,7 +2931,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; ; GFX8-LABEL: v_test_canonicalize_var_v2f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2916,7 +2948,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: v_test_canonicalize_var_v2f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2929,9 +2961,11 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: v_test_canonicalize_var_v2f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 
0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2944,9 +2978,11 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: v_test_canonicalize_var_v2f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[0:1] ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll index 7d8f43bbe16b7..845b25a8f61bd 100644 --- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @fcmp_f16_lt( ; SI-LABEL: fcmp_f16_lt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -32,33 +32,33 @@ define amdgpu_kernel void @fcmp_f16_lt( ; ; VI-LABEL: fcmp_f16_lt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; 
VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_lt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -95,8 +95,8 @@ entry: define amdgpu_kernel void @fcmp_f16_lt_abs( ; SI-LABEL: fcmp_f16_lt_abs: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -121,33 +121,33 @@ define amdgpu_kernel void @fcmp_f16_lt_abs( ; ; VI-LABEL: fcmp_f16_lt_abs: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, 
-1 +; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: v_cmp_lt_f16_e64 s[4:5], |v0|, |v1| -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: v_cmp_lt_f16_e64 s[0:1], |v0|, |v1| +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_lt_abs: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -187,8 +187,8 @@ entry: define amdgpu_kernel void @fcmp_f16_eq( ; SI-LABEL: fcmp_f16_eq: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -213,33 +213,33 @@ define amdgpu_kernel void @fcmp_f16_eq( ; ; VI-LABEL: fcmp_f16_eq: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: 
s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: v_cmp_eq_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_eq: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -276,8 +276,8 @@ entry: define amdgpu_kernel void @fcmp_f16_le( ; SI-LABEL: fcmp_f16_le: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -302,33 +302,33 @@ define amdgpu_kernel void @fcmp_f16_le( ; ; VI-LABEL: fcmp_f16_le: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; 
VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: v_cmp_le_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_le: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -365,8 +365,8 @@ entry: define amdgpu_kernel void @fcmp_f16_gt( ; SI-LABEL: fcmp_f16_gt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -391,33 +391,33 @@ define amdgpu_kernel void @fcmp_f16_gt( ; ; VI-LABEL: 
fcmp_f16_gt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_gt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -454,8 +454,8 @@ entry: define amdgpu_kernel void @fcmp_f16_lg( ; SI-LABEL: fcmp_f16_lg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 
s14, s10 @@ -480,33 +480,33 @@ define amdgpu_kernel void @fcmp_f16_lg( ; ; VI-LABEL: fcmp_f16_lg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: v_cmp_lg_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_lg: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -543,8 +543,8 @@ entry: define amdgpu_kernel void @fcmp_f16_ge( ; SI-LABEL: fcmp_f16_ge: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd 
; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -569,33 +569,33 @@ define amdgpu_kernel void @fcmp_f16_ge( ; ; VI-LABEL: fcmp_f16_ge: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: v_cmp_ge_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_ge: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -632,8 +632,8 @@ entry: define amdgpu_kernel void @fcmp_f16_o( ; SI-LABEL: fcmp_f16_o: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -658,33 +658,33 @@ define amdgpu_kernel void @fcmp_f16_o( ; ; VI-LABEL: fcmp_f16_o: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_o: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -721,8 +721,8 @@ entry: define amdgpu_kernel void @fcmp_f16_u( ; SI-LABEL: fcmp_f16_u: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -747,33 +747,33 @@ define amdgpu_kernel void @fcmp_f16_u( ; ; VI-LABEL: fcmp_f16_u: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: v_cmp_u_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_u: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -810,8 +810,8 @@ entry: define amdgpu_kernel void 
@fcmp_f16_nge( ; SI-LABEL: fcmp_f16_nge: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -836,33 +836,33 @@ define amdgpu_kernel void @fcmp_f16_nge( ; ; VI-LABEL: fcmp_f16_nge: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: v_cmp_nge_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_nge: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 
0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -899,8 +899,8 @@ entry: define amdgpu_kernel void @fcmp_f16_nlg( ; SI-LABEL: fcmp_f16_nlg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -925,33 +925,33 @@ define amdgpu_kernel void @fcmp_f16_nlg( ; ; VI-LABEL: fcmp_f16_nlg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: v_cmp_nlg_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_nlg: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -988,8 +988,8 @@ entry: define amdgpu_kernel void @fcmp_f16_ngt( ; SI-LABEL: fcmp_f16_ngt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1014,33 +1014,33 @@ define amdgpu_kernel void @fcmp_f16_ngt( ; ; VI-LABEL: fcmp_f16_ngt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_ngt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -1077,8 +1077,8 @@ entry: define amdgpu_kernel void @fcmp_f16_nle( ; SI-LABEL: fcmp_f16_nle: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1103,33 +1103,33 @@ define amdgpu_kernel void @fcmp_f16_nle( ; ; VI-LABEL: fcmp_f16_nle: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_nle: ; GFX11: ; %bb.0: ; 
%entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -1166,8 +1166,8 @@ entry: define amdgpu_kernel void @fcmp_f16_neq( ; SI-LABEL: fcmp_f16_neq: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1192,33 +1192,33 @@ define amdgpu_kernel void @fcmp_f16_neq( ; ; VI-LABEL: fcmp_f16_neq: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: v_cmp_neq_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: 
buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_neq: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -1255,8 +1255,8 @@ entry: define amdgpu_kernel void @fcmp_f16_nlt( ; SI-LABEL: fcmp_f16_nlt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1281,33 +1281,33 @@ define amdgpu_kernel void @fcmp_f16_nlt( ; ; VI-LABEL: fcmp_f16_nlt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 ; VI-NEXT: 
v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_nlt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -1344,8 +1344,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_lt( ; SI-LABEL: fcmp_v2f16_lt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1376,21 +1376,21 @@ define amdgpu_kernel void @fcmp_v2f16_lt( ; ; VI-LABEL: fcmp_v2f16_lt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(1) 
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1399,14 +1399,14 @@ define amdgpu_kernel void @fcmp_v2f16_lt( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_lt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -1449,8 +1449,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_eq( ; SI-LABEL: fcmp_v2f16_eq: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1481,21 +1481,21 @@ define amdgpu_kernel void @fcmp_v2f16_eq( ; ; VI-LABEL: fcmp_v2f16_eq: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1504,14 +1504,14 @@ define amdgpu_kernel void @fcmp_v2f16_eq( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_eq_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_eq: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -1553,8 +1553,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_le( ; SI-LABEL: fcmp_v2f16_le: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1585,21 +1585,21 @@ define amdgpu_kernel void @fcmp_v2f16_le( ; ; VI-LABEL: fcmp_v2f16_le: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1608,14 +1608,14 @@ define amdgpu_kernel void @fcmp_v2f16_le( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_le_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_le: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -1657,8 +1657,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_gt( ; SI-LABEL: fcmp_v2f16_gt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1689,21 +1689,21 @@ define amdgpu_kernel void @fcmp_v2f16_gt( ; ; VI-LABEL: fcmp_v2f16_gt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: 
s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1712,14 +1712,14 @@ define amdgpu_kernel void @fcmp_v2f16_gt( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_gt_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_gt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -1762,8 +1762,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_lg( ; SI-LABEL: fcmp_v2f16_lg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1794,21 +1794,21 @@ define amdgpu_kernel void @fcmp_v2f16_lg( ; ; 
VI-LABEL: fcmp_v2f16_lg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1817,14 +1817,14 @@ define amdgpu_kernel void @fcmp_v2f16_lg( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_lg_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_lg: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -1867,8 +1867,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_ge( ; SI-LABEL: fcmp_v2f16_ge: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1899,21 +1899,21 @@ define amdgpu_kernel void @fcmp_v2f16_ge( ; ; VI-LABEL: fcmp_v2f16_ge: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1922,14 +1922,14 @@ define amdgpu_kernel void @fcmp_v2f16_ge( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_ge_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_ge: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: 
s_mov_b32 s2, s10 @@ -1972,8 +1972,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_o( ; SI-LABEL: fcmp_v2f16_o: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -2004,21 +2004,21 @@ define amdgpu_kernel void @fcmp_v2f16_o( ; ; VI-LABEL: fcmp_v2f16_o: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2027,14 +2027,14 @@ define amdgpu_kernel void @fcmp_v2f16_o( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_o_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_o: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], 
s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -2077,8 +2077,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_u( ; SI-LABEL: fcmp_v2f16_u: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -2109,21 +2109,21 @@ define amdgpu_kernel void @fcmp_v2f16_u( ; ; VI-LABEL: fcmp_v2f16_u: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2132,14 +2132,14 @@ define amdgpu_kernel void @fcmp_v2f16_u( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_u_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_u: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -2181,8 +2181,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_nge( ; SI-LABEL: fcmp_v2f16_nge: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -2213,21 +2213,21 @@ define amdgpu_kernel void @fcmp_v2f16_nge( ; ; VI-LABEL: fcmp_v2f16_nge: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 
v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2236,14 +2236,14 @@ define amdgpu_kernel void @fcmp_v2f16_nge( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_nge_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_nge: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -2285,8 +2285,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_nlg( ; SI-LABEL: fcmp_v2f16_nlg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -2317,21 +2317,21 @@ define amdgpu_kernel void @fcmp_v2f16_nlg( ; ; VI-LABEL: fcmp_v2f16_nlg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: 
buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2340,14 +2340,14 @@ define amdgpu_kernel void @fcmp_v2f16_nlg( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_nlg_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_nlg: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -2390,8 +2390,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_ngt( ; SI-LABEL: fcmp_v2f16_ngt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -2422,21 +2422,21 @@ define amdgpu_kernel void @fcmp_v2f16_ngt( ; ; VI-LABEL: fcmp_v2f16_ngt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2445,14 +2445,14 @@ define amdgpu_kernel void @fcmp_v2f16_ngt( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_ngt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -2494,8 +2494,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_nle( ; SI-LABEL: fcmp_v2f16_nle: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -2526,21 +2526,21 @@ define amdgpu_kernel void @fcmp_v2f16_nle( ; ; VI-LABEL: fcmp_v2f16_nle: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: 
s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2549,14 +2549,14 @@ define amdgpu_kernel void @fcmp_v2f16_nle( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_nle_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_nle: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -2598,8 +2598,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_neq( ; SI-LABEL: fcmp_v2f16_neq: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -2630,21 +2630,21 @@ define amdgpu_kernel void @fcmp_v2f16_neq( ; ; 
VI-LABEL: fcmp_v2f16_neq: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2653,14 +2653,14 @@ define amdgpu_kernel void @fcmp_v2f16_neq( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_neq_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_neq: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -2702,8 +2702,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_nlt( ; SI-LABEL: fcmp_v2f16_nlt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; 
SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -2734,21 +2734,21 @@ define amdgpu_kernel void @fcmp_v2f16_nlt( ; ; VI-LABEL: fcmp_v2f16_nlt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2757,14 +2757,14 @@ define amdgpu_kernel void @fcmp_v2f16_nlt( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_nlt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; 
GFX11-NEXT: s_mov_b32 s2, s10 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index eda1709e4fd59..bd483f4c07071 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -15,30 +15,31 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag, half %sign) { ; SI-LABEL: s_copysign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_lshr_b32 s2, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_lshr_b32 s0, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_copysign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_movk_i32 s3, 0x7fff +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_movk_i32 s2, 0x7fff ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s2, 16 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_bfi_b32 v2, s3, v0, v1 +; VI-NEXT: s_lshr_b32 s3, s4, 16 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_bfi_b32 v2, s2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -46,29 +47,29 @@ define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag, ; ; 
GFX9-LABEL: s_copysign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_movk_i32 s0, 0x7fff +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_movk_i32 s2, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s1, s4, 16 +; GFX9-NEXT: s_lshr_b32 s3, s4, 16 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_copysign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, s3 -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s4, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -81,8 +82,8 @@ define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag, define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: 
s_waitcnt lgkmcnt(0) @@ -93,10 +94,10 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma ; ; VI-LABEL: s_test_copysign_f16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 0x7fff +; VI-NEXT: s_and_b32 s2, s4, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -105,22 +106,22 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma ; ; GFX9-LABEL: s_test_copysign_f16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s4, 0x7fff -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -135,8 +136,8 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_1: ; SI: ; 
%bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -147,10 +148,10 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma ; ; VI-LABEL: s_test_copysign_f16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 0x7fff +; VI-NEXT: s_and_b32 s2, s4, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -159,22 +160,22 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma ; ; GFX9-LABEL: s_test_copysign_f16_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s4, 0x7fff -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: 
global_store_b16 v0, v1, s[0:1] @@ -189,8 +190,8 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_10.0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -201,10 +202,10 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half ; ; VI-LABEL: s_test_copysign_f16_10.0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 0x7fff +; VI-NEXT: s_and_b32 s2, s4, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -213,22 +214,22 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_10.0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s4, 0x7fff -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_10.0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -243,8 +244,8 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -255,10 +256,10 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half ; ; VI-LABEL: s_test_copysign_f16_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset1_b32 s2, 15 +; VI-NEXT: s_or_b32 s2, s4, 0x8000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -267,22 +268,22 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_neg1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_or_b32 s0, s4, 0x8000 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: s_or_b32 s2, s4, 0x8000 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_short v0, v1, 
s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_neg1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset1_b32 s2, 15 +; GFX11-NEXT: s_or_b32 s2, s4, 0x8000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -297,8 +298,8 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_neg10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -309,10 +310,10 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half ; ; VI-LABEL: s_test_copysign_f16_neg10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset1_b32 s2, 15 +; VI-NEXT: s_or_b32 s2, s4, 0x8000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -321,22 +322,22 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_neg10: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_or_b32 s0, s4, 0x8000 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: s_or_b32 s2, s4, 0x8000 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_neg10: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset1_b32 s2, 15 +; GFX11-NEXT: s_or_b32 s2, s4, 0x8000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -351,25 +352,26 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_0_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, 0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_0_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, s2, v0 
+; VI-NEXT: v_and_b32_e32 v2, s4, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -377,23 +379,23 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_0_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_0_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v1, 0xffff8000, s2 +; GFX11-NEXT: v_and_b32_e64 v1, 0xffff8000, s4 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -407,25 +409,26 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, 1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 
v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s2, v0 +; VI-NEXT: v_and_b32_e32 v0, s4, v0 ; VI-NEXT: v_or_b32_e32 v2, 0x3c00, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -434,24 +437,24 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_1_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: v_or_b32_e32 v1, 0x3c00, v1 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_1_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2 +; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x3c00, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] @@ -466,26 +469,27 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half define amdgpu_kernel void 
@s_test_copysign_f16_10_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: v_mov_b32_e32 v1, 0x41200000 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s2, v0 +; VI-NEXT: v_and_b32_e32 v0, s4, v0 ; VI-NEXT: v_or_b32_e32 v2, 0x4900, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -494,24 +498,24 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, hal ; ; GFX9-LABEL: s_test_copysign_f16_10_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: v_or_b32_e32 v1, 0x4900, v1 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_10_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 
s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2 +; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x4900, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] @@ -526,25 +530,26 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, hal define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_neg1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, -1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_neg1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s2, v0 +; VI-NEXT: v_and_b32_e32 v0, s4, v0 ; VI-NEXT: v_or_b32_e32 v2, 0x3c00, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -553,24 +558,24 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h ; ; GFX9-LABEL: s_test_copysign_f16_neg1_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: v_or_b32_e32 v1, 0x3c00, v1 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_neg1_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2 +; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x3c00, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] @@ -585,26 +590,27 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_neg10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: v_mov_b32_e32 v1, 0xc1200000 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_neg10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s2, v0 +; VI-NEXT: v_and_b32_e32 v0, s4, v0 ; VI-NEXT: v_or_b32_e32 v2, 0x4900, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -613,24 +619,24 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out, ; ; GFX9-LABEL: s_test_copysign_f16_neg10_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: v_or_b32_e32 v1, 0x4900, v1 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_neg10_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2 +; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x4900, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] @@ -823,8 +829,8 @@ define half @v_test_copysign_f16_neg10(half %mag) { define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f32_mag_f16_sign_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -849,8 +855,8 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f32_mag_f16_sign_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -874,17 +880,17 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f32_mag_f16_sign_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_brev_b32 s0, -2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v1, s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX9-NEXT: global_load_dword v0, v0, s[0:1] +; GFX9-NEXT: s_brev_b32 s0, -2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v0, s0, v1, v0 ; GFX9-NEXT: global_store_dword v2, v0, s[4:5] @@ -893,8 +899,10 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f32_mag_f16_sign_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -923,8 +931,8 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f64_mag_f16_sign_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -949,8 +957,8 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f64_mag_f16_sign_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -974,15 +982,15 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f64_mag_f16_sign_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: s_brev_b32 s0, -2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v2, v1, s[6:7] ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] 
+; GFX9-NEXT: s_brev_b32 s0, -2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v2 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 @@ -993,9 +1001,12 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) ; ; GFX11-LABEL: v_copysign_out_f64_mag_f16_sign_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v2, v1, s[6:7] @@ -1024,8 +1035,8 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f32_mag_f32_sign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -1049,8 +1060,8 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f32_mag_f32_sign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1074,14 +1085,14 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) ; ; GFX9-LABEL: 
v_copysign_out_f32_mag_f32_sign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_brev_b32 s0, -2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v1, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v1, s[0:1] +; GFX9-NEXT: s_brev_b32 s0, -2 ; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -1093,8 +1104,10 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f32_mag_f32_sign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1123,8 +1136,8 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f64_mag_f64_sign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -1150,8 +1163,8 @@ define amdgpu_kernel void 
@v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f64_mag_f64_sign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1175,14 +1188,14 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f64_mag_f64_sign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: s_brev_b32 s0, -2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v1, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v2, v1, s[0:1] +; GFX9-NEXT: s_brev_b32 s0, -2 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -1194,10 +1207,12 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f64_mag_f64_sign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v2, v1, 
s[4:5] ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] @@ -1224,8 +1239,8 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -1251,8 +1266,8 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1276,14 +1291,14 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_movk_i32 s0, 0x7fff -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v1, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v1, s[0:1] +; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: global_load_ushort v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -1295,8 +1310,10 @@ define 
amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1325,35 +1342,35 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 ; SI-NEXT: 
v_bfi_b32 v0, s0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 @@ -1375,15 +1392,15 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] +; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, s0, v2, v1 @@ -1393,10 +1410,12 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: 
v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5] ; GFX11-NEXT: global_load_u16 v0, v2, s[2:3] @@ -1423,8 +1442,8 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f16_mag_f32_sign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -1452,8 +1471,8 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f16_mag_f32_sign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1477,17 +1496,17 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f16_mag_f32_sign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_movk_i32 s0, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v1, s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: 
v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: global_load_ushort v0, v0, s[0:1] +; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v0, s0, v1, v0 ; GFX9-NEXT: global_store_short v2, v0, s[4:5] @@ -1496,8 +1515,10 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f16_mag_f32_sign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1526,8 +1547,8 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) %arg_out, double %mag, half %sign) { ; SI-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 ; SI-NEXT: s_lshr_b32 s4, s3, 8 @@ -1590,8 +1611,8 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; ; VI-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s0, s7, 8 ; VI-NEXT: s_and_b32 s1, s7, 0x1ff @@ -1648,8 +1669,8 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; ; GFX9-LABEL: 
s_copysign_out_f16_mag_f64_sign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s0, s7, 8 @@ -1706,8 +1727,8 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; GFX11-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s7, 0x1ff ; GFX11-NEXT: s_lshr_b32 s2, s7, 8 @@ -1777,7 +1798,7 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half> %arg_mag, <2 x half> %arg_sign) { ; SI-LABEL: s_copysign_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1801,7 +1822,7 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half ; ; VI-LABEL: s_copysign_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_movk_i32 s4, 0x7fff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1821,7 +1842,7 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half ; ; GFX9-LABEL: s_copysign_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1840,7 +1861,7 
@@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half ; ; GFX11-LABEL: s_copysign_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, s3 @@ -1866,8 +1887,8 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half> %arg_mag, <3 x half> %arg_sign) { ; SI-LABEL: s_copysign_v3f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s2, s4, 16 @@ -1894,8 +1915,8 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half ; ; VI-LABEL: s_copysign_v3f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_movk_i32 s2, 0x7fff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -1923,33 +1944,33 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half ; ; GFX9-LABEL: s_copysign_v3f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_movk_i32 s0, 0x7fff +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_movk_i32 s2, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: s_lshr_b32 s1, s6, 16 +; GFX9-NEXT: s_lshr_b32 s3, s6, 16 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; 
GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 +; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_bfi_b32 v2, s2, v2, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3 -; GFX9-NEXT: global_store_short v0, v2, s[2:3] offset:4 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: v_bfi_b32 v2, s2, v2, v3 +; GFX9-NEXT: global_store_short v0, v2, s[0:1] offset:4 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_copysign_v3f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s2, s6, 16 @@ -1978,8 +1999,8 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half> %arg_mag, <4 x half> %arg_sign) { ; SI-LABEL: s_copysign_v4f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2013,8 +2034,8 @@ define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half ; ; VI-LABEL: s_copysign_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: 
s_movk_i32 s2, 0x7fff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s5 @@ -2044,39 +2065,39 @@ define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half ; ; GFX9-LABEL: s_copysign_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_movk_i32 s0, 0x7fff +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_movk_i32 s2, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_lshr_b32 s1, s7, 16 +; GFX9-NEXT: s_lshr_b32 s3, s7, 16 ; GFX9-NEXT: s_lshr_b32 s5, s5, 16 -; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v1 +; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: s_lshr_b32 s1, s6, 16 +; GFX9-NEXT: s_lshr_b32 s3, s6, 16 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v3 +; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_bfi_b32 v3, s0, v3, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_bfi_b32 v3, s2, v3, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_copysign_v4f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 
s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s7 ; GFX11-NEXT: v_mov_b32_e32 v1, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll index f48961c905f58..542d67486e758 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag, float %sign) { ; SI-LABEL: s_test_copysign_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_brev_b32 s8, -2 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 @@ -21,7 +21,7 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag ; ; VI-LABEL: s_test_copysign_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_brev_b32 s4, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -34,7 +34,7 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag ; ; GFX11-LABEL: s_test_copysign_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -51,8 +51,8 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -63,10 +63,10 @@ define 
amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %m ; ; VI-LABEL: s_test_copysign_f32_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset0_b32 s2, 31 +; VI-NEXT: s_and_b32 s2, s4, 0x7fffffff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -76,10 +76,10 @@ define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %m ; GFX11-LABEL: s_test_copysign_f32_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset0_b32 s2, 31 +; GFX11-NEXT: s_and_b32 s2, s4, 0x7fffffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -94,8 +94,8 @@ define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %m define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -106,10 +106,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %m ; ; VI-LABEL: s_test_copysign_f32_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) -; VI-NEXT: s_bitset0_b32 s2, 31 +; VI-NEXT: s_and_b32 s2, s4, 0x7fffffff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -119,10 +119,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %m ; GFX11-LABEL: s_test_copysign_f32_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset0_b32 s2, 31 +; GFX11-NEXT: s_and_b32 s2, s4, 0x7fffffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -137,8 +137,8 @@ define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %m define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_10.0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -149,10 +149,10 @@ define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float ; ; VI-LABEL: s_test_copysign_f32_10.0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset0_b32 s2, 31 +; VI-NEXT: s_and_b32 s2, s4, 0x7fffffff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -162,10 +162,10 @@ define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float ; GFX11-LABEL: 
s_test_copysign_f32_10.0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset0_b32 s2, 31 +; GFX11-NEXT: s_and_b32 s2, s4, 0x7fffffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -180,8 +180,8 @@ define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -192,10 +192,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float ; ; VI-LABEL: s_test_copysign_f32_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset1_b32 s2, 31 +; VI-NEXT: s_or_b32 s2, s4, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -205,10 +205,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float ; GFX11-LABEL: s_test_copysign_f32_neg1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: 
s_bitset1_b32 s2, 31 +; GFX11-NEXT: s_or_b32 s2, s4, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -223,8 +223,8 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_neg10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -235,10 +235,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, floa ; ; VI-LABEL: s_test_copysign_f32_neg10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset1_b32 s2, 31 +; VI-NEXT: s_or_b32 s2, s4, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -248,10 +248,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, floa ; GFX11-LABEL: s_test_copysign_f32_neg10: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset1_b32 s2, 31 +; GFX11-NEXT: s_or_b32 s2, s4, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -266,8 +266,8 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) 
%out, floa define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_0_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -278,10 +278,10 @@ define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, floa ; ; VI-LABEL: s_test_copysign_f32_0_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 0x80000000 +; VI-NEXT: s_and_b32 s2, s4, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -291,10 +291,10 @@ define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, floa ; GFX11-LABEL: s_test_copysign_f32_0_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -310,8 +310,8 @@ define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, floa define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; 
SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -323,10 +323,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, floa ; ; VI-LABEL: s_test_copysign_f32_1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 0x80000000 +; VI-NEXT: s_and_b32 s2, s4, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 1.0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -337,10 +337,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, floa ; GFX11-LABEL: s_test_copysign_f32_1_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 1.0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -356,8 +356,8 @@ define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, floa define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -369,10 +369,10 @@ define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, flo ; ; VI-LABEL: 
s_test_copysign_f32_10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 0x80000000 +; VI-NEXT: s_and_b32 s2, s4, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 0x41200000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -383,10 +383,10 @@ define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, flo ; GFX11-LABEL: s_test_copysign_f32_10_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -402,8 +402,8 @@ define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, flo define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_neg1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -415,10 +415,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, f ; ; VI-LABEL: s_test_copysign_f32_neg1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 0x80000000 +; VI-NEXT: s_and_b32 s2, s4, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 1.0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -429,10 +429,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, f ; GFX11-LABEL: s_test_copysign_f32_neg1_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 1.0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -448,8 +448,8 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, f define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_neg10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -461,10 +461,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, ; ; VI-LABEL: s_test_copysign_f32_neg10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 0x80000000 +; VI-NEXT: s_and_b32 s2, s4, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 0x41200000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -475,10 +475,10 @@ define 
amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, ; GFX11-LABEL: s_test_copysign_f32_neg10_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -494,8 +494,8 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x float> %mag, <2 x float> %sign) { ; SI-LABEL: s_test_copysign_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_brev_b32 s8, -2 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -511,8 +511,8 @@ define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x flo ; ; VI-LABEL: s_test_copysign_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s5 @@ -529,8 +529,8 @@ define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x flo ; GFX11-LABEL: s_test_copysign_v2f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7 ; GFX11-NEXT: v_mov_b32_e32 v2, s6 @@ -549,40 +549,40 @@ define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x flo define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x float> %mag, <3 x float> %sign) { ; SI-LABEL: s_test_copysign_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s7, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_bfi_b32 v1, s7, v0, v1 +; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_bfi_b32 v0, s7, v0, v2 +; SI-NEXT: v_bfi_b32 v0, s0, v0, v2 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s10 -; SI-NEXT: v_bfi_b32 v2, s7, v2, v3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_bfi_b32 v2, s0, v2, v3 +; SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_brev_b32 s7, -2 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s10 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_bfi_b32 v2, s2, v0, v1 +; VI-NEXT: v_bfi_b32 v2, s7, v0, v1 ; 
VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_bfi_b32 v1, s2, v3, v0 +; VI-NEXT: v_bfi_b32 v1, s7, v3, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v3, s8 -; VI-NEXT: v_bfi_b32 v0, s2, v0, v3 +; VI-NEXT: v_bfi_b32 v0, s7, v0, v3 ; VI-NEXT: v_mov_b32_e32 v4, s1 ; VI-NEXT: v_mov_b32_e32 v3, s0 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] @@ -591,8 +591,8 @@ define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x flo ; GFX11-LABEL: s_test_copysign_v3f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v1, s9 @@ -614,45 +614,45 @@ define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x flo define amdgpu_kernel void @s_test_copysign_v4f32(ptr addrspace(1) %out, <4 x float> %mag, <4 x float> %sign) { ; SI-LABEL: s_test_copysign_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; SI-NEXT: s_brev_b32 s12, -2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_bfi_b32 v3, s12, v0, v1 +; SI-NEXT: v_bfi_b32 v3, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_mov_b32_e32 v1, s10 -; SI-NEXT: v_bfi_b32 v2, s12, v0, v1 +; SI-NEXT: v_bfi_b32 v2, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_bfi_b32 v1, s12, v0, v1 +; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: 
v_mov_b32_e32 v4, s8 -; SI-NEXT: v_bfi_b32 v0, s12, v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: v_bfi_b32 v0, s0, v0, v4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_brev_b32 s12, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: v_mov_b32_e32 v1, s11 ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_bfi_b32 v3, s2, v0, v1 +; VI-NEXT: v_bfi_b32 v3, s12, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s10 -; VI-NEXT: v_bfi_b32 v2, s2, v2, v0 +; VI-NEXT: v_bfi_b32 v2, s12, v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_bfi_b32 v1, s2, v0, v1 +; VI-NEXT: v_bfi_b32 v1, s12, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_bfi_b32 v0, s2, v0, v4 +; VI-NEXT: v_bfi_b32 v0, s12, v0, v4 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -661,8 +661,8 @@ define amdgpu_kernel void @s_test_copysign_v4f32(ptr addrspace(1) %out, <4 x flo ; GFX11-LABEL: s_test_copysign_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s11 :: v_dual_mov_b32 v1, s10 @@ -906,46 +906,46 @@ define <5 x float> @v_test_copysign_v5f32(<5 x float> %mag, <5 x float> %sign) { define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out, float %mag, double %sign) { ; SI-LABEL: 
s_test_copysign_f32_fptrunc_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_bfi_b32 v0, s0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f32_fptrunc_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: s_brev_b32 s0, -2 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_bfi_b32 v2, s2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_bfi_b32 v2, s0, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f32_fptrunc_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 +; 
GFX11-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -958,7 +958,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %out, double %sign) { ; SI-LABEL: s_test_copysign_f32_1_fptrunc_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -972,7 +972,7 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %o ; ; VI-LABEL: s_test_copysign_f32_1_fptrunc_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_and_b32 s0, s3, 0x80000000 @@ -984,7 +984,7 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %o ; ; GFX11-LABEL: s_test_copysign_f32_1_fptrunc_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -1003,7 +1003,7 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %o define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, float %mag, half %sign) { ; SI-LABEL: 
s_test_copysign_f32_fpext_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1018,7 +1018,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, ; ; VI-LABEL: s_test_copysign_f32_fpext_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_brev_b32 s4, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s3 @@ -1031,7 +1031,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, ; ; GFX11-LABEL: s_test_copysign_f32_fpext_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s3 @@ -1050,23 +1050,24 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, define amdgpu_kernel void @s_test_copysign_f32_1_fpext_f16(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f32_1_fpext_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v0 ; SI-NEXT: v_or_b32_e32 v0, 1.0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f32_1_fpext_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s2, s2, 16 +; VI-NEXT: s_lshl_b32 s2, s4, 16 ; VI-NEXT: s_and_b32 s2, s2, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 1.0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1078,10 +1079,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fpext_f16(ptr addrspace(1) %out ; GFX11-LABEL: s_test_copysign_f32_1_fpext_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_lshl_b32 s2, s4, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_or_b32 s2, s2, 1.0 @@ -1100,7 +1101,7 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fpext_f16(ptr addrspace(1) %out define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out, float %mag, bfloat %sign) { ; SI-LABEL: s_test_copysign_f32_fpext_bf16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1116,7 +1117,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out, ; ; VI-LABEL: s_test_copysign_f32_fpext_bf16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_brev_b32 s4, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s3 @@ -1129,7 +1130,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out, ; ; GFX11-LABEL: s_test_copysign_f32_fpext_bf16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: 
v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s3 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll index b5fa3fd9eccc1..4300faa02742a 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll @@ -11,49 +11,49 @@ declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) #0 define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], double %sign) { ; SI-LABEL: s_test_copysign_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x1d +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x1d ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_brev_b32 s6, -2 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_bfi_b32 v1, s6, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x74 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_brev_b32 s4, -2 -; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: v_mov_b32_e32 v1, s5 
-; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_bfi_b32 v1, s4, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x74 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x74 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x4c +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -65,8 +65,8 @@ define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32], define amdgpu_kernel void @s_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32], double %mag) { ; SI-LABEL: s_test_copysign_f64_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -78,28 +78,28 @@ define amdgpu_kernel void @s_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32 ; ; VI-LABEL: s_test_copysign_f64_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 
0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset0_b32 s3, 31 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_bitset0_b32 s1, 31 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f64_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset0_b32 s3, 31 +; GFX11-NEXT: s_bitset0_b32 s1, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -111,8 +111,8 @@ define amdgpu_kernel void @s_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32 define amdgpu_kernel void @s_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32], double %mag) { ; SI-LABEL: s_test_copysign_f64_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -124,28 +124,28 @@ define amdgpu_kernel void @s_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32 ; ; VI-LABEL: s_test_copysign_f64_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset0_b32 s3, 31 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_bitset0_b32 s1, 31 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f64_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset0_b32 s3, 31 +; GFX11-NEXT: s_bitset0_b32 s1, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -157,8 +157,8 @@ define amdgpu_kernel void @s_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32 define amdgpu_kernel void @s_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i32], double %mag) { ; SI-LABEL: s_test_copysign_f64_10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -170,28 +170,28 @@ define amdgpu_kernel void @s_test_copysign_f64_10(ptr 
addrspace(1) %out, [8 x i3 ; ; VI-LABEL: s_test_copysign_f64_10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset0_b32 s3, 31 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_bitset0_b32 s1, 31 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f64_10: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset0_b32 s3, 31 +; GFX11-NEXT: s_bitset0_b32 s1, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -203,8 +203,8 @@ define amdgpu_kernel void @s_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @s_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x i32], double %mag) { ; SI-LABEL: s_test_copysign_f64_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; 
SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -216,28 +216,28 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x ; ; VI-LABEL: s_test_copysign_f64_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset1_b32 s3, 31 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_bitset1_b32 s1, 31 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f64_neg1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset1_b32 s3, 31 +; GFX11-NEXT: s_bitset1_b32 s1, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -249,8 +249,8 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @s_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x i32], double %mag) { ; SI-LABEL: s_test_copysign_f64_neg10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -262,28 +262,28 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x ; ; VI-LABEL: s_test_copysign_f64_neg10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset1_b32 s3, 31 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_bitset1_b32 s1, 31 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f64_neg10: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset1_b32 s3, 31 +; GFX11-NEXT: s_bitset1_b32 s1, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -295,49 +295,49 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @s_test_copysign_f64_f32(ptr addrspace(1) %out, [8 x i32], double 
%mag, [8 x i32], float %sign) { ; SI-LABEL: s_test_copysign_f64_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 -; SI-NEXT: s_load_dword s0, s[0:1], 0x1d -; SI-NEXT: s_brev_b32 s1, -2 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_load_dword s6, s[2:3], 0x1d +; SI-NEXT: s_brev_b32 s7, -2 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: v_bfi_b32 v1, s1, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_bfi_b32 v1, s7, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f64_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dword s4, s[0:1], 0x74 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dword s4, s[2:3], 0x74 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_brev_b32 s5, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_bfi_b32 v1, s5, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f64_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x74 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; 
GFX11-NEXT: s_load_b32 s4, s[2:3], 0x74 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -350,49 +350,49 @@ define amdgpu_kernel void @s_test_copysign_f64_f32(ptr addrspace(1) %out, [8 x i define amdgpu_kernel void @s_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], half %sign) { ; SI-LABEL: s_test_copysign_f64_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0x1d -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x13 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s6, s[2:3], 0x1d +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_bfi_b32 v1, s2, v1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: s_brev_b32 s6, -2 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_bfi_b32 v1, s6, v1, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f64_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x74 -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x74 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_brev_b32 s5, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s4 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_bfi_b32 v1, s5, v1, v0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f64_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x74 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x74 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -405,7 +405,7 @@ define amdgpu_kernel void @s_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i define amdgpu_kernel void @s_test_copysign_f64_0_mag(ptr addrspace(1) %out, double %sign) { ; SI-LABEL: s_test_copysign_f64_0_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -419,7 +419,7 @@ define amdgpu_kernel 
void @s_test_copysign_f64_0_mag(ptr addrspace(1) %out, doub ; ; VI-LABEL: s_test_copysign_f64_0_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -431,7 +431,7 @@ define amdgpu_kernel void @s_test_copysign_f64_0_mag(ptr addrspace(1) %out, doub ; ; GFX11-LABEL: s_test_copysign_f64_0_mag: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -448,7 +448,7 @@ define amdgpu_kernel void @s_test_copysign_f64_0_mag(ptr addrspace(1) %out, doub define amdgpu_kernel void @s_test_copysign_f64_1_mag(ptr addrspace(1) %out, double %sign) { ; SI-LABEL: s_test_copysign_f64_1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -463,7 +463,7 @@ define amdgpu_kernel void @s_test_copysign_f64_1_mag(ptr addrspace(1) %out, doub ; ; VI-LABEL: s_test_copysign_f64_1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -476,7 +476,7 @@ define amdgpu_kernel void @s_test_copysign_f64_1_mag(ptr addrspace(1) %out, doub ; ; GFX11-LABEL: s_test_copysign_f64_1_mag: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -494,7 +494,7 @@ define amdgpu_kernel void @s_test_copysign_f64_1_mag(ptr addrspace(1) %out, doub 
define amdgpu_kernel void @s_test_copysign_f64_10_mag(ptr addrspace(1) %out, double %sign) { ; SI-LABEL: s_test_copysign_f64_10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -509,7 +509,7 @@ define amdgpu_kernel void @s_test_copysign_f64_10_mag(ptr addrspace(1) %out, dou ; ; VI-LABEL: s_test_copysign_f64_10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -522,7 +522,7 @@ define amdgpu_kernel void @s_test_copysign_f64_10_mag(ptr addrspace(1) %out, dou ; ; GFX11-LABEL: s_test_copysign_f64_10_mag: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -540,7 +540,7 @@ define amdgpu_kernel void @s_test_copysign_f64_10_mag(ptr addrspace(1) %out, dou define amdgpu_kernel void @s_test_copysign_f64_neg1_mag(ptr addrspace(1) %out, double %sign) { ; SI-LABEL: s_test_copysign_f64_neg1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -555,7 +555,7 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1_mag(ptr addrspace(1) %out, d ; ; VI-LABEL: s_test_copysign_f64_neg1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -568,7 +568,7 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1_mag(ptr addrspace(1) %out, d 
; ; GFX11-LABEL: s_test_copysign_f64_neg1_mag: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -586,7 +586,7 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1_mag(ptr addrspace(1) %out, d define amdgpu_kernel void @s_test_copysign_f64_neg10_mag(ptr addrspace(1) %out, double %sign) { ; SI-LABEL: s_test_copysign_f64_neg10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -601,7 +601,7 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10_mag(ptr addrspace(1) %out, ; ; VI-LABEL: s_test_copysign_f64_neg10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -614,7 +614,7 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10_mag(ptr addrspace(1) %out, ; ; GFX11-LABEL: s_test_copysign_f64_neg10_mag: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -632,36 +632,36 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10_mag(ptr addrspace(1) %out, define amdgpu_kernel void @s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x double> %mag, <2 x double> %sign) { ; SI-LABEL: s_test_copysign_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[12:13], 
s[2:3], 0x9 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s8, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_bfi_b32 v3, s8, v0, v1 +; SI-NEXT: v_bfi_b32 v3, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_bfi_b32 v1, s8, v0, v1 +; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_brev_b32 s8, -2 ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: v_mov_b32_e32 v1, s11 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_bfi_b32 v3, s2, v0, v1 +; VI-NEXT: v_bfi_b32 v3, s8, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s9 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_bfi_b32 v1, s2, v2, v0 +; VI-NEXT: v_bfi_b32 v1, s8, v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v4, s0 @@ -671,8 +671,8 @@ define amdgpu_kernel void @s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x dou ; GFX11-LABEL: s_test_copysign_v2f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s11 ; GFX11-NEXT: v_mov_b32_e32 v2, s9 @@ -693,46 +693,46 @@ define amdgpu_kernel 
void @s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x dou define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x double> %mag, <3 x double> %sign) { ; SI-LABEL: s_test_copysign_v3f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x11 +; SI-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: s_mov_b32 s23, 0xf000 +; SI-NEXT: s_mov_b32 s22, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s10, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: v_mov_b32_e32 v1, s15 -; SI-NEXT: v_bfi_b32 v3, s10, v0, v1 +; SI-NEXT: v_bfi_b32 v3, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: v_bfi_b32 v1, s10, v0, v1 +; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s9 ; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_bfi_b32 v5, s10, v0, v2 +; SI-NEXT: v_bfi_b32 v5, s0, v0, v2 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[20:23], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v3f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_brev_b32 s10, -2 ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: v_mov_b32_e32 v1, s15 +; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_bfi_b32 v3, s2, v0, v1 +; VI-NEXT: v_bfi_b32 v3, s10, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, 
s13 -; VI-NEXT: v_bfi_b32 v1, s2, v2, v0 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_bfi_b32 v1, s10, v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s9 ; VI-NEXT: v_mov_b32_e32 v2, s17 -; VI-NEXT: v_bfi_b32 v5, s2, v0, v2 -; VI-NEXT: s_add_u32 s2, s0, 16 -; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: v_bfi_b32 v5, s10, v0, v2 ; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] @@ -746,8 +746,8 @@ define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x dou ; GFX11-LABEL: s_test_copysign_v3f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x44 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v1, s15 ; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v0, s4 @@ -771,54 +771,53 @@ define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x dou define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x double> %mag, <4 x double> %sign) { ; SI-LABEL: s_test_copysign_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x11 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s23, 0xf000 +; SI-NEXT: s_mov_b32 s22, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s12, -2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: v_mov_b32_e32 v1, s15 -; SI-NEXT: v_bfi_b32 v3, s12, v0, v1 +; SI-NEXT: v_bfi_b32 v3, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: v_bfi_b32 v1, s12, v0, v1 +; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s11 ; SI-NEXT: 
v_mov_b32_e32 v2, s19 -; SI-NEXT: v_bfi_b32 v7, s12, v0, v2 +; SI-NEXT: v_bfi_b32 v7, s0, v0, v2 ; SI-NEXT: v_mov_b32_e32 v0, s9 ; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_bfi_b32 v5, s12, v0, v2 +; SI-NEXT: v_bfi_b32 v5, s0, v0, v2 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_brev_b32 s12, -2 ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: v_mov_b32_e32 v1, s15 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_bfi_b32 v3, s2, v0, v1 +; VI-NEXT: v_bfi_b32 v3, s12, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s13 -; VI-NEXT: v_bfi_b32 v1, s2, v2, v0 +; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: v_bfi_b32 v1, s12, v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: v_mov_b32_e32 v2, s19 -; VI-NEXT: v_bfi_b32 v7, s2, v0, v2 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_bfi_b32 v7, s12, v0, v2 ; VI-NEXT: v_mov_b32_e32 v0, s9 ; VI-NEXT: v_mov_b32_e32 v2, s17 -; VI-NEXT: v_bfi_b32 v5, s2, v0, v2 -; VI-NEXT: s_add_u32 s2, s0, 16 -; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v9, s3 +; VI-NEXT: v_bfi_b32 v5, s12, v0, v2 ; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v6, s10 ; VI-NEXT: v_mov_b32_e32 v8, s2 @@ -833,8 +832,8 @@ define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x dou ; GFX11-LABEL: s_test_copysign_v4f64: ; GFX11: ; %bb.0: ; 
GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x44 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s15 ; GFX11-NEXT: v_dual_mov_b32 v3, s19 :: v_dual_mov_b32 v2, s10 diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll index b14b6421f56b4..f53d3cf33c9cc 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -10,20 +10,20 @@ define amdgpu_kernel void @v_fdiv_f16( ; SI-LABEL: v_fdiv_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[0:1], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[10:11], s[2:3] -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[0:3], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_div_scale_f32 v4, s[0:1], v3, v3, v2 @@ -46,8 +46,8 @@ define amdgpu_kernel void @v_fdiv_f16( ; ; GFX8-LABEL: v_fdiv_f16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; 
GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -74,13 +74,13 @@ define amdgpu_kernel void @v_fdiv_f16( ; ; GFX9-LABEL: v_fdiv_f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX9-NEXT: v_rcp_f32_e32 v3, v3 @@ -92,13 +92,13 @@ define amdgpu_kernel void @v_fdiv_f16( ; GFX10-LABEL: v_fdiv_f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 @@ -110,8 +110,10 @@ define amdgpu_kernel void @v_fdiv_f16( ; GFX11-LABEL: v_fdiv_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc @@ -147,7 +149,7 @@ entry: define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rcp_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -178,7 +180,7 @@ define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX8-LABEL: v_rcp_f16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -195,7 +197,7 @@ define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX9-LABEL: v_rcp_f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -206,7 +208,7 @@ define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX10-LABEL: v_rcp_f16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -217,7 +219,9 @@ define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX11-LABEL: v_rcp_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -241,7 +245,7 @@ entry: define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rcp_f16_abs: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -272,7 +276,7 @@ define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX8-LABEL: v_rcp_f16_abs: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -289,7 +293,7 @@ define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX9-LABEL: v_rcp_f16_abs: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -300,7 +304,7 @@ define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX10-LABEL: v_rcp_f16_abs: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -311,7 +315,9 @@ define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX11-LABEL: v_rcp_f16_abs: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: 
v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -338,7 +344,7 @@ entry: define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: reciprocal_f16_rounded: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -369,7 +375,7 @@ define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrs ; ; GFX8-LABEL: reciprocal_f16_rounded: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -386,7 +392,7 @@ define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrs ; ; GFX9-LABEL: reciprocal_f16_rounded: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -397,7 +403,7 @@ define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrs ; ; GFX10-LABEL: reciprocal_f16_rounded: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -408,7 +414,9 @@ define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrs ; ; GFX11-LABEL: reciprocal_f16_rounded: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -432,7 +440,7 @@ entry: define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rcp_f16_afn: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -450,7 +458,7 @@ define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX8-LABEL: v_rcp_f16_afn: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -467,7 +475,7 @@ define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX9-LABEL: v_rcp_f16_afn: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -478,7 +486,7 @@ define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX10-LABEL: v_rcp_f16_afn: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -489,7 +497,9 @@ define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX11-LABEL: v_rcp_f16_afn: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -513,7 +523,7 @@ entry: define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rcp_f16_neg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -544,7 +554,7 @@ define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX8-LABEL: v_rcp_f16_neg: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -561,7 +571,7 @@ define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX9-LABEL: v_rcp_f16_neg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -572,7 +582,7 @@ define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX10-LABEL: v_rcp_f16_neg: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -583,7 +593,9 @@ define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX11-LABEL: v_rcp_f16_neg: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: 
v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -607,7 +619,7 @@ entry: define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rsq_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -641,7 +653,7 @@ define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX8-LABEL: v_rsq_f16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -658,7 +670,7 @@ define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX9-LABEL: v_rsq_f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -669,7 +681,7 @@ define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX10-LABEL: v_rsq_f16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -680,7 +692,9 @@ define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX11-LABEL: v_rsq_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -705,7 +719,7 @@ entry: define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rsq_f16_neg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -739,7 +753,7 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX8-LABEL: v_rsq_f16_neg: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -757,7 +771,7 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX9-LABEL: v_rsq_f16_neg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -769,7 +783,7 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX10-LABEL: v_rsq_f16_neg: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -781,7 +795,9 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX11-LABEL: v_rsq_f16_neg: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -808,7 +824,7 @@ entry: define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rsq_f16_multi_use: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -844,7 +860,7 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac ; ; GFX8-LABEL: v_rsq_f16_multi_use: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -863,7 +879,7 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac ; ; GFX9-LABEL: v_rsq_f16_multi_use: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -876,7 +892,7 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac ; ; GFX10-LABEL: v_rsq_f16_multi_use: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -889,7 +905,9 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac ; ; GFX11-LABEL: v_rsq_f16_multi_use: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -917,7 +935,7 @@ entry: define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rsq_f16_missing_contract0: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -951,7 +969,7 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr ; ; GFX8-LABEL: v_rsq_f16_missing_contract0: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -969,7 +987,7 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr ; ; GFX9-LABEL: v_rsq_f16_missing_contract0: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -981,7 +999,7 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr ; ; GFX10-LABEL: v_rsq_f16_missing_contract0: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -993,7 +1011,9 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr ; ; GFX11-LABEL: v_rsq_f16_missing_contract0: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1020,7 +1040,7 @@ entry: define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rsq_f16_missing_contract1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1054,7 +1074,7 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr ; ; GFX8-LABEL: v_rsq_f16_missing_contract1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1072,7 +1092,7 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr ; ; GFX9-LABEL: v_rsq_f16_missing_contract1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -1084,7 +1104,7 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr ; ; GFX10-LABEL: v_rsq_f16_missing_contract1: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1096,7 +1116,9 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr ; ; GFX11-LABEL: v_rsq_f16_missing_contract1: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1123,7 +1145,7 @@ entry: define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_neg_rsq_f16_missing_contract1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1157,7 +1179,7 @@ define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r, ; ; GFX8-LABEL: v_neg_rsq_f16_missing_contract1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1175,7 +1197,7 @@ define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r, ; ; GFX9-LABEL: v_neg_rsq_f16_missing_contract1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -1187,7 +1209,7 @@ define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r, ; ; GFX10-LABEL: v_neg_rsq_f16_missing_contract1: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1199,7 +1221,9 @@ define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r, ; ; GFX11-LABEL: v_neg_rsq_f16_missing_contract1: ; GFX11: ; %bb.0: ; %entry 
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1226,20 +1250,20 @@ entry: define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_fdiv_f16_afn: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[0:1], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[10:11], s[2:3] -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[0:3], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_rcp_f32_e32 v3, v3 @@ -1250,8 +1274,8 @@ define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) ; ; GFX8-LABEL: v_fdiv_f16_afn: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; GFX8-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -1274,13 +1298,13 @@ define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) ; ; GFX9-LABEL: v_fdiv_f16_afn: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f16_e32 v2, v2 ; GFX9-NEXT: v_mul_f16_e32 v1, v1, v2 @@ -1290,13 +1314,13 @@ define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) ; GFX10-LABEL: v_fdiv_f16_afn: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f16_e32 v2, v2 ; GFX10-NEXT: v_mul_f16_e32 v1, v1, v2 @@ -1306,8 +1330,10 @@ define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) ; GFX11-LABEL: v_fdiv_f16_afn: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc @@ -1337,20 +1363,20 @@ entry: define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) #2 { ; SI-LABEL: v_fdiv_f16_unsafe: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[0:1], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[10:11], s[2:3] -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[0:3], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_rcp_f32_e32 v3, v3 @@ -1361,8 +1387,8 @@ define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace( ; ; GFX8-LABEL: v_fdiv_f16_unsafe: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -1385,13 +1411,13 @@ define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr 
addrspace( ; ; GFX9-LABEL: v_fdiv_f16_unsafe: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f16_e32 v2, v2 ; GFX9-NEXT: v_mul_f16_e32 v1, v1, v2 @@ -1401,13 +1427,13 @@ define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace( ; GFX10-LABEL: v_fdiv_f16_unsafe: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f16_e32 v2, v2 ; GFX10-NEXT: v_mul_f16_e32 v1, v1, v2 @@ -1417,8 +1443,10 @@ define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace( ; GFX11-LABEL: v_fdiv_f16_unsafe: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: 
global_load_u16 v1, v0, s[6:7] glc dlc @@ -1448,7 +1476,7 @@ entry: define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 { ; SI-LABEL: div_afn_2_x_pat_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1463,7 +1491,7 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX8-LABEL: div_afn_2_x_pat_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v2, 0.5, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1475,7 +1503,7 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX9-LABEL: div_afn_2_x_pat_f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v0, 0.5, v0 @@ -1486,7 +1514,7 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX10-LABEL: div_afn_2_x_pat_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_f16_e32 v0, 0.5, v0 @@ -1497,7 +1525,7 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX11-LABEL: div_afn_2_x_pat_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mul_f16_e32 v0, 0.5, v0 @@ -1515,7 +1543,7 
@@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; SI-LABEL: div_afn_k_x_pat_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1530,7 +1558,7 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX8-LABEL: div_afn_k_x_pat_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v2, 0x2e66, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1542,7 +1570,7 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX9-LABEL: div_afn_k_x_pat_f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v0, 0x2e66, v0 @@ -1553,7 +1581,7 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX10-LABEL: div_afn_k_x_pat_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_f16_e32 v0, 0x2e66, v0 @@ -1564,7 +1592,7 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX11-LABEL: div_afn_k_x_pat_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mul_f16_e32 v0, 0x2e66, 
v0 @@ -1582,7 +1610,7 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; SI-LABEL: div_afn_neg_k_x_pat_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1597,7 +1625,7 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX8-LABEL: div_afn_neg_k_x_pat_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v2, 0xae66, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1609,7 +1637,7 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX9-LABEL: div_afn_neg_k_x_pat_f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v0, 0xae66, v0 @@ -1620,7 +1648,7 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX10-LABEL: div_afn_neg_k_x_pat_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_f16_e32 v0, 0xae66, v0 @@ -1631,7 +1659,7 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX11-LABEL: div_afn_neg_k_x_pat_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mul_f16_e32 v0, 0xae66, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll index 0468175c5df50..c6b730e3fd5d6 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -16,7 +16,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_ninf: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -42,7 +42,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_ninf: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -68,7 +68,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX7-LABEL: s_fdiv_f32_ninf: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -94,7 +94,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX8-LABEL: s_fdiv_f32_ninf: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], s3, s3, v0 @@ -118,7 +118,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX10-LABEL: s_fdiv_f32_ninf: ; GFX10: ; %bb.0: ; %entry -; 
GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 ; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 @@ -139,7 +139,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX11-LABEL: s_fdiv_f32_ninf: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 ; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 @@ -181,7 +181,7 @@ entry: define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, float %b) #1 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_ieee: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -205,7 +205,7 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_ieee: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -229,7 +229,7 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX7-LABEL: s_fdiv_f32_ieee: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -253,7 +253,7 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX8-LABEL: s_fdiv_f32_ieee: ; GFX8: ; %bb.0: ; %entry -; 
GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], s3, s3, v0 @@ -275,7 +275,7 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX10-LABEL: s_fdiv_f32_ieee: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 @@ -294,7 +294,7 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX11-LABEL: s_fdiv_f32_ieee: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 ; GFX11-NEXT: v_rcp_f32_e32 v1, v0 @@ -334,7 +334,7 @@ entry: define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX67-LABEL: s_fdiv_25ulp_f32: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX67-NEXT: v_mov_b32_e32 v0, 0x6f800000 ; GFX67-NEXT: v_mov_b32_e32 v1, 0x2f800000 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 @@ -353,7 +353,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo ; ; GFX8-LABEL: s_fdiv_25ulp_f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x6f800000 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x2f800000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -370,7 +370,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo ; ; GFX10-LABEL: s_fdiv_25ulp_f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |s3| @@ -384,7 +384,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo ; ; GFX11-LABEL: s_fdiv_25ulp_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |s3| @@ -420,7 +420,7 @@ entry: define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a, float %b) #1 { ; GFX6-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -446,7 +446,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a ; ; GFX7-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -465,7 +465,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a ; ; GFX8-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_frexp_mant_f32_e32 v1, s3 ; GFX8-NEXT: v_rcp_f32_e32 v1, v1 @@ -482,7 +482,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a ; ; GFX10-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: 
v_frexp_mant_f32_e32 v0, s3 ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, s3 @@ -498,7 +498,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a ; ; GFX11-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_frexp_mant_f32_e32 v0, s3 ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, s3 @@ -535,7 +535,7 @@ entry: define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, float %b) #1 { ; GFX67-LABEL: s_fdiv_fast_ieee_f32: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -548,7 +548,7 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_fast_ieee_f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -559,7 +559,7 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_fast_ieee_f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s3 @@ -569,7 +569,7 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, ; ; GFX11-LABEL: s_fdiv_fast_ieee_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -599,7 +599,7 @@ entry: define 
amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX67-LABEL: s_fdiv_f32_fast_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -612,7 +612,7 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_f32_fast_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -623,7 +623,7 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_f32_fast_math: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s3 @@ -633,7 +633,7 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, ; ; GFX11-LABEL: s_fdiv_f32_fast_math: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -663,7 +663,7 @@ entry: define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX67-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -676,7 +676,7 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo ; ; GFX8-LABEL: 
s_fdiv_ulp25_f32_fast_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -687,7 +687,7 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo ; ; GFX10-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s3 @@ -697,7 +697,7 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo ; ; GFX11-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -727,7 +727,7 @@ entry: define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_arcp_daz: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -753,7 +753,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_arcp_daz: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -779,7 +779,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX7-LABEL: 
s_fdiv_f32_arcp_daz: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -805,7 +805,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_f32_arcp_daz: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], s3, s3, v0 @@ -829,7 +829,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_f32_arcp_daz: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 ; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 @@ -850,7 +850,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX11-LABEL: s_fdiv_f32_arcp_daz: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 ; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 @@ -892,7 +892,7 @@ entry: define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX67-LABEL: s_fdiv_f32_arcp_ninf: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -905,7 +905,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_f32_arcp_ninf: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -916,7 +916,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_f32_arcp_ninf: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s3 @@ -926,7 +926,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, ; ; GFX11-LABEL: s_fdiv_f32_arcp_ninf: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -956,8 +956,8 @@ entry: define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_v2f32: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -996,11 +996,10 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; ; GFX6-SLOWFMA-LABEL: s_fdiv_v2f32: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[2:3], s7, s7, v0 +; GFX6-SLOWFMA-NEXT: 
v_div_scale_f32 v1, s[0:1], s7, s7, v0 ; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v2, s7 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s5, v2, s5 ; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v4, s4 @@ -1013,10 +1012,11 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v1, -v1, v5, v2 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[2:3], s6, s6, v4 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[0:1], s6, s6, v4 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v1, v1, v3, v5 ; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, s4, v3, s4 +; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v5, v2 @@ -1031,13 +1031,14 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v2, v0, v5 ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s6, v4 +; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-SLOWFMA-NEXT: s_endpgm ; ; GFX7-LABEL: s_fdiv_v2f32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1076,11 +1077,10 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; ; GFX8-LABEL: s_fdiv_v2f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c ; GFX8-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_div_scale_f32 v1, s[2:3], s7, s7, v0 +; GFX8-NEXT: v_div_scale_f32 v1, s[0:1], s7, s7, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s7 ; GFX8-NEXT: v_div_scale_f32 v2, vcc, s5, v2, s5 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 @@ -1093,10 +1093,11 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX8-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX8-NEXT: v_fma_f32 v1, -v1, v5, v2 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s6, s6, v4 +; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s6, s6, v4 ; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v5 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: v_div_scale_f32 v3, vcc, s4, v3, s4 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_rcp_f32_e32 v5, v2 ; GFX8-NEXT: v_div_fixup_f32 v1, v1, s7, v0 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -1108,6 +1109,7 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX8-NEXT: v_div_fmas_f32 v0, v2, v0, v5 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_div_fixup_f32 v0, v0, s6, v4 @@ -1116,11 +1118,10 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; ; GFX10-LABEL: s_fdiv_v2f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, s7, s7, s5 +; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s5 ; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s5, s7, s5 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0 @@ -1130,8 +1131,9 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr 
addrspace(1) %out, <2 x float> %a, < ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_scale_f32 v2, s2, s6, s6, s4 +; GFX10-NEXT: v_div_scale_f32 v2, s0, s6, s6, s4 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-NEXT: v_div_fixup_f32 v1, v0, s7, s5 ; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s4, s6, s4 @@ -1153,8 +1155,8 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX11-LABEL: s_fdiv_v2f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s7, s7, s5 ; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s5, s7, s5 @@ -1212,8 +1214,8 @@ entry: define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX67-LABEL: s_fdiv_ulp25_v2f32: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX67-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX67-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX67-NEXT: s_mov_b32 s3, 0xf000 ; GFX67-NEXT: s_mov_b32 s2, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -1226,8 +1228,8 @@ define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float> ; ; GFX8-LABEL: s_fdiv_ulp25_v2f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s6 ; GFX8-NEXT: v_rcp_f32_e32 v1, s7 @@ -1241,22 +1243,22 @@ define 
amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float> ; GFX10-LABEL: s_fdiv_ulp25_v2f32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s6 ; GFX10-NEXT: v_rcp_f32_e32 v1, s7 ; GFX10-NEXT: v_mul_f32_e32 v0, s4, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, s5, v1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_ulp25_v2f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s6 ; GFX11-NEXT: v_rcp_f32_e32 v1, s7 @@ -1290,8 +1292,8 @@ entry: define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX67-LABEL: s_fdiv_v2f32_fast_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX67-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX67-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX67-NEXT: s_mov_b32 s3, 0xf000 ; GFX67-NEXT: s_mov_b32 s2, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -1304,8 +1306,8 @@ define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x fl ; ; GFX8-LABEL: s_fdiv_v2f32_fast_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s7 ; GFX8-NEXT: v_rcp_f32_e32 v2, s6 @@ -1319,22 +1321,22 @@ define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x fl ; GFX10-LABEL: s_fdiv_v2f32_fast_math: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s7 ; GFX10-NEXT: v_rcp_f32_e32 v2, s6 ; GFX10-NEXT: v_mul_f32_e32 v1, s5, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, s4, v2 -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[2:3] +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_v2f32_fast_math: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s7 ; GFX11-NEXT: v_rcp_f32_e32 v2, s6 @@ -1368,8 +1370,8 @@ entry: define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX67-LABEL: s_fdiv_v2f32_arcp_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX67-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX67-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX67-NEXT: s_mov_b32 s3, 0xf000 ; GFX67-NEXT: s_mov_b32 s2, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -1382,8 +1384,8 @@ define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x fl ; ; GFX8-LABEL: s_fdiv_v2f32_arcp_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s7 ; GFX8-NEXT: v_rcp_f32_e32 v2, s6 @@ -1397,22 +1399,22 @@ define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x fl ; GFX10-LABEL: s_fdiv_v2f32_arcp_math: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s7 ; GFX10-NEXT: v_rcp_f32_e32 v2, s6 ; GFX10-NEXT: v_mul_f32_e32 v1, s5, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, s4, v2 -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[2:3] +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_v2f32_arcp_math: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s7 ; GFX11-NEXT: v_rcp_f32_e32 v2, s6 @@ -1446,7 +1448,7 @@ entry: define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_v4f32: ; GFX6-FASTFMA: ; %bb.0: -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-FASTFMA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-FASTFMA-NEXT: s_mov_b32 s11, 0xf000 @@ -1517,7 +1519,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX6-SLOWFMA-LABEL: s_fdiv_v4f32: ; GFX6-SLOWFMA: ; %bb.0: -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 
s[8:11], s[0:1], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -1588,7 +1590,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX7-LABEL: s_fdiv_v4f32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 @@ -1659,7 +1661,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: s_fdiv_v4f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1730,7 +1732,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: s_fdiv_v4f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1792,7 +1794,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: s_fdiv_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1893,7 +1895,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX67-LABEL: s_fdiv_v4f32_fast_math: ; GFX67: ; %bb.0: -; GFX67-NEXT: s_load_dwordx4 s[8:11], 
s[0:1], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX67-NEXT: s_mov_b32 s11, 0xf000 @@ -1912,7 +1914,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: s_fdiv_v4f32_fast_math: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v4, s8 @@ -1931,7 +1933,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: s_fdiv_v4f32_fast_math: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -1949,7 +1951,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: s_fdiv_v4f32_fast_math: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2001,7 +2003,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX67-LABEL: s_fdiv_v4f32_arcp_math: ; GFX67: ; %bb.0: -; GFX67-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX67-NEXT: s_mov_b32 s11, 0xf000 @@ -2020,7 +2022,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: s_fdiv_v4f32_arcp_math: ; GFX8: ; %bb.0: -; GFX8-NEXT: 
s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v4, s8 @@ -2039,7 +2041,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: s_fdiv_v4f32_arcp_math: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -2057,7 +2059,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: s_fdiv_v4f32_arcp_math: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2109,8 +2111,8 @@ define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspace(1) %out, float %a) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dword s6, s[0:1], 0xb -; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dword s6, s[2:3], 0xb +; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -2132,11 +2134,11 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dword s4, s[2:3], 0xb ; GFX6-SLOWFMA-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0 +; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v2, v0 @@ -2150,13 +2152,14 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v0, v2, v3 ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 +; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-SLOWFMA-NEXT: s_endpgm ; ; GFX7-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s6, s[0:1], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s6, s[2:3], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2178,11 +2181,11 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; ; GFX8-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0 +; GFX8-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0 ; GFX8-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_rcp_f32_e32 v2, v0 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v3, -v0, v2, 1.0 @@ -2194,6 +2197,7 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX8-NEXT: s_setreg_imm32_b32 
hwreg(HW_REG_MODE, 4, 2), 0 ; GFX8-NEXT: v_div_fmas_f32 v0, v0, v2, v3 ; GFX8-NEXT: v_div_fixup_f32 v2, v0, s4, 1.0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2201,11 +2205,11 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; ; GFX10-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s3, s2, s2, 1.0 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_div_scale_f32 v0, s0, s4, s4, 1.0 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0 @@ -2217,7 +2221,7 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0 +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -2225,11 +2229,11 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX11-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s2, 1.0 -; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0 +; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s4, 
1.0 +; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0 ; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -2242,7 +2246,7 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX11-NEXT: s_denorm_mode 12 ; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0 +; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2267,8 +2271,8 @@ entry: define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr addrspace(1) %out, float %a) #1 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dword s6, s[0:1], 0xb -; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dword s6, s[2:3], 0xb +; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -2288,11 +2292,11 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dword s4, s[2:3], 0xb ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0 +; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v2, v0 @@ -2304,13 +2308,14 @@ define amdgpu_kernel void 
@s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, -v0, v3, v1 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v0, v2, v3 ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 +; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-SLOWFMA-NEXT: s_endpgm ; ; GFX7-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s6, s[0:1], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s6, s[2:3], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2330,11 +2335,11 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; ; GFX8-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0 +; GFX8-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0 ; GFX8-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_rcp_f32_e32 v2, v0 ; GFX8-NEXT: v_fma_f32 v3, -v0, v2, 1.0 ; GFX8-NEXT: v_fma_f32 v2, v3, v2, v2 @@ -2344,6 +2349,7 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; GFX8-NEXT: v_fma_f32 v0, -v0, v3, v1 ; GFX8-NEXT: v_div_fmas_f32 v0, v0, v2, v3 ; GFX8-NEXT: v_div_fixup_f32 v2, v0, s4, 1.0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2351,21 +2357,21 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; ; GFX10-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dword 
s2, s[0:1], 0x2c +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s3, s2, s2, 1.0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_div_scale_f32 v0, s0, s4, s4, 1.0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX10-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0 ; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX10-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0 +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -2373,22 +2379,22 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; GFX11-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s2, 1.0 +; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s4, 1.0 ; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX11-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0 +; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0 ; GFX11-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX11-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX11-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX11-NEXT: 
v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0 +; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll index c56b4ae3c34f5..8e43bd890a8fa 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @div_1_by_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_1_by_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -20,7 +20,7 @@ define amdgpu_kernel void @div_1_by_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_1_by_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -37,7 +37,7 @@ define amdgpu_kernel void @div_1_by_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_minus_1_by_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_minus_1_by_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -52,7 +52,7 @@ define amdgpu_kernel void @div_minus_1_by_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_minus_1_by_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -69,7 +69,7 @@ define amdgpu_kernel void @div_minus_1_by_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_1_by_minus_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -84,7 +84,7 @@ define amdgpu_kernel void @div_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_1_by_minus_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -102,7 +102,7 @@ define amdgpu_kernel void @div_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_minus_1_by_minus_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -117,7 +117,7 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_minus_1_by_minus_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -135,7 +135,7 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg) 
{ define amdgpu_kernel void @div_v4_1_by_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_v4_1_by_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -165,7 +165,7 @@ define amdgpu_kernel void @div_v4_1_by_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_v4_1_by_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -185,7 +185,7 @@ define amdgpu_kernel void @div_v4_1_by_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_v4_minus_1_by_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -215,7 +215,7 @@ define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_v4_minus_1_by_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -235,7 +235,7 @@ define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_v4_1_by_minus_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-DENORM-NEXT: 
s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -265,7 +265,7 @@ define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_v4_1_by_minus_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -286,7 +286,7 @@ define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_v4_minus_1_by_minus_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -316,7 +316,7 @@ define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg ; ; GCN-FLUSH-LABEL: div_v4_minus_1_by_minus_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -337,7 +337,7 @@ define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg define amdgpu_kernel void @div_v4_c_by_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_v4_c_by_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -369,7 +369,7 @@ 
define amdgpu_kernel void @div_v4_c_by_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_v4_c_by_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v0, 0x6f800000 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0x2f800000 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0 @@ -401,7 +401,7 @@ define amdgpu_kernel void @div_v4_c_by_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_v4_c_by_minus_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -433,7 +433,7 @@ define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_v4_c_by_minus_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v0, 0x6f800000 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v2, 0x2f800000 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0 @@ -468,40 +468,40 @@ define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_v_by_x_25ulp(ptr addrspace(1) %arg, float %num) { ; GCN-DENORM-LABEL: div_v_by_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DENORM-NEXT: s_load_dword s0, s[2:3], 0x0 +; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 ; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s4 ; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v3, 
s4 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v1, s0 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v1, s2 ; GCN-DENORM-NEXT: v_rcp_f32_e32 v1, v1 -; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v4, s0 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v4, s2 ; GCN-DENORM-NEXT: v_sub_u32_e32 v2, v2, v4 ; GCN-DENORM-NEXT: v_mul_f32_e32 v1, v3, v1 ; GCN-DENORM-NEXT: v_ldexp_f32 v1, v1, v2 -; GCN-DENORM-NEXT: global_store_dword v0, v1, s[2:3] +; GCN-DENORM-NEXT: global_store_dword v0, v1, s[0:1] ; GCN-DENORM-NEXT: s_endpgm ; ; GCN-FLUSH-LABEL: div_v_by_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-FLUSH-NEXT: v_mov_b32_e32 v0, 0x6f800000 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0x2f800000 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v2, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GCN-FLUSH-NEXT: s_load_dword s0, s[2:3], 0x0 +; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |s0|, v0 +; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |s2|, v0 ; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, s0, v0 +; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, s2, v0 ; GCN-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, s4, v1 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-FLUSH-NEXT: global_store_dword v2, v0, s[2:3] +; GCN-FLUSH-NEXT: global_store_dword v2, v0, s[0:1] ; GCN-FLUSH-NEXT: s_endpgm %load = load float, ptr addrspace(1) %arg, align 4 %div = fdiv float %num, %load, !fpmath !0 @@ -512,7 +512,7 @@ define amdgpu_kernel void @div_v_by_x_25ulp(ptr addrspace(1) %arg, float %num) { define amdgpu_kernel void @div_1_by_x_fast(ptr addrspace(1) %arg) { ; GCN-LABEL: div_1_by_x_fast: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -529,7 +529,7 @@ define amdgpu_kernel void @div_1_by_x_fast(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_minus_1_by_x_fast(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_minus_1_by_x_fast: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -540,7 +540,7 @@ define amdgpu_kernel void @div_minus_1_by_x_fast(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_minus_1_by_x_fast: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -558,7 +558,7 @@ define amdgpu_kernel void @div_minus_1_by_x_fast(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_1_by_minus_x_fast(ptr addrspace(1) %arg) { ; GCN-LABEL: div_1_by_minus_x_fast: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -576,7 +576,7 @@ define amdgpu_kernel void @div_1_by_minus_x_fast(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_minus_1_by_minus_x_fast(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_minus_1_by_minus_x_fast: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -587,7 +587,7 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_fast(ptr 
addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_minus_1_by_minus_x_fast: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -606,7 +606,7 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_fast(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_1_by_x_correctly_rounded(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_1_by_x_correctly_rounded: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) @@ -627,7 +627,7 @@ define amdgpu_kernel void @div_1_by_x_correctly_rounded(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_1_by_x_correctly_rounded: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) @@ -656,7 +656,7 @@ define amdgpu_kernel void @div_1_by_x_correctly_rounded(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_minus_1_by_x_correctly_rounded: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) @@ -677,7 +677,7 @@ define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(ptr addrspace(1) % ; ; GCN-FLUSH-LABEL: div_minus_1_by_x_correctly_rounded: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-FLUSH-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) @@ -706,7 +706,7 @@ define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(ptr addrspace(1) % define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_1_by_minus_x_correctly_rounded: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) @@ -727,7 +727,7 @@ define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(ptr addrspace(1) % ; ; GCN-FLUSH-LABEL: div_1_by_minus_x_correctly_rounded: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) @@ -757,7 +757,7 @@ define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(ptr addrspace(1) % define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_minus_1_by_minus_x_correctly_rounded: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) @@ -778,7 +778,7 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(ptr addrspac ; ; GCN-FLUSH-LABEL: div_minus_1_by_minus_x_correctly_rounded: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0 ; 
GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll index b9583a73295e2..431b7d5400f43 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll @@ -7,10 +7,10 @@ declare void @extern_func() #0 define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() { ; FLAT_SCR_OPT-LABEL: stack_object_addrspacecast_in_kernel_no_calls: ; FLAT_SCR_OPT: ; %bb.0: -; FLAT_SCR_OPT-NEXT: s_add_u32 s0, s0, s3 -; FLAT_SCR_OPT-NEXT: s_addc_u32 s1, s1, 0 -; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 -; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; FLAT_SCR_OPT-NEXT: s_add_u32 s6, s6, s11 +; FLAT_SCR_OPT-NEXT: s_addc_u32 s7, s7, 0 +; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 ; FLAT_SCR_OPT-NEXT: s_mov_b64 s[0:1], src_private_base ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v0, 0 ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v1, s1 @@ -37,10 +37,10 @@ define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() { define amdgpu_kernel void @stack_object_in_kernel_no_calls() { ; FLAT_SCR_OPT-LABEL: stack_object_in_kernel_no_calls: ; FLAT_SCR_OPT: ; %bb.0: -; FLAT_SCR_OPT-NEXT: s_add_u32 s0, s0, s3 -; FLAT_SCR_OPT-NEXT: s_addc_u32 s1, s1, 0 -; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 -; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; FLAT_SCR_OPT-NEXT: s_add_u32 s6, s6, s11 +; FLAT_SCR_OPT-NEXT: s_addc_u32 s7, s7, 0 +; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v0, 0 ; FLAT_SCR_OPT-NEXT: s_mov_b32 s0, 0 ; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s0 @@ -110,22 +110,22 @@ define amdgpu_kernel void @kernel_calls_no_stack() { define amdgpu_kernel void @test(ptr 
addrspace(1) %out, i32 %in) { ; FLAT_SCR_OPT-LABEL: test: ; FLAT_SCR_OPT: ; %bb.0: -; FLAT_SCR_OPT-NEXT: s_add_u32 s2, s2, s5 -; FLAT_SCR_OPT-NEXT: s_addc_u32 s3, s3, 0 -; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; FLAT_SCR_OPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; FLAT_SCR_OPT-NEXT: s_add_u32 s6, s6, s11 +; FLAT_SCR_OPT-NEXT: s_addc_u32 s7, s7, 0 +; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; FLAT_SCR_OPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; FLAT_SCR_OPT-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; FLAT_SCR_OPT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s2, 0 -; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s3, 1 +; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s0, 0 +; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s1, 1 ; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s2, 0 -; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill +; FLAT_SCR_OPT-NEXT: s_mov_b32 s0, 0 +; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill ; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 ; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105 -; FLAT_SCR_OPT-NEXT: s_load_dword vcc_lo, s[0:1], 0x8 -; FLAT_SCR_OPT-NEXT: ; kill: killed $sgpr0_sgpr1 +; FLAT_SCR_OPT-NEXT: s_load_dword vcc_lo, s[2:3], 0x8 +; FLAT_SCR_OPT-NEXT: ; kill: killed $sgpr2_sgpr3 ; FLAT_SCR_OPT-NEXT: ;;#ASMSTART ; FLAT_SCR_OPT-NEXT: ;;#ASMEND ; FLAT_SCR_OPT-NEXT: ;;#ASMSTART @@ -237,18 +237,18 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { ; ; FLAT_SCR_ARCH-LABEL: test: ; FLAT_SCR_ARCH: ; %bb.0: -; FLAT_SCR_ARCH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; FLAT_SCR_ARCH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; FLAT_SCR_ARCH-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; FLAT_SCR_ARCH-NEXT: s_waitcnt lgkmcnt(0) -; 
FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s2, 0 -; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s3, 1 +; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s0, 0 +; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s1, 1 ; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s2, 0 -; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s0, 0 +; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill ; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 ; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105 -; FLAT_SCR_ARCH-NEXT: s_load_dword vcc_lo, s[0:1], 0x8 -; FLAT_SCR_ARCH-NEXT: ; kill: killed $sgpr0_sgpr1 +; FLAT_SCR_ARCH-NEXT: s_load_dword vcc_lo, s[2:3], 0x8 +; FLAT_SCR_ARCH-NEXT: ; kill: killed $sgpr2_sgpr3 ; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART ; FLAT_SCR_ARCH-NEXT: ;;#ASMEND ; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll index 0af57c6a97db5..087d38ce7b004 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -14,18 +14,19 @@ declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX940-SDAG-LABEL: soff1_voff1: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, v1, v0 ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 -; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, 
v0 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -35,17 +36,18 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff1_voff1: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-GISEL-NEXT: v_add3_u32 v0, v1, s0, v0 ; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 1, v0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 2 -; GFX940-GISEL-NEXT: scratch_store_byte v3, v1, off sc0 sc1 +; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 @@ -55,12 +57,13 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff1_voff1: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 +; 
GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc @@ -73,12 +76,13 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff1_voff1: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc @@ -91,9 +95,9 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff1_voff1: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 4 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS @@ -106,10 +110,11 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff1_voff1: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: 
v_dual_mov_b32 v2, 2 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 @@ -137,18 +142,19 @@ bb: define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX940-SDAG-LABEL: soff1_voff2: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 ; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v1 ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 -; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -158,7 +164,8 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff1_voff2: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 @@ 
-179,14 +186,15 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff1_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 @@ -198,14 +206,15 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff1_voff2: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -217,9 +226,11 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff1_voff2: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS @@ -232,11 +243,12 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff1_voff2: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 @@ -264,18 +276,19 @@ bb: define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX940-SDAG-LABEL: soff1_voff4: ; GFX940-SDAG: ; %bb.0: ; %bb -; 
GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 ; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 -; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -285,7 +298,8 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff1_voff4: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 @@ -306,14 +320,15 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff1_voff4: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 @@ -325,14 +340,15 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff1_voff4: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -344,9 +360,11 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff1_voff4: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 
v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS @@ -359,11 +377,12 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff1_voff4: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 @@ -391,19 +410,20 @@ bb: define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX940-SDAG-LABEL: soff2_voff1: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, v1, v0 ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 -; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 -; GFX940-SDAG-NEXT: 
scratch_store_byte v3, v1, off sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -413,18 +433,19 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff2_voff1: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX940-GISEL-NEXT: v_add3_u32 v0, v1, s0, v0 ; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 1, v0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 2 -; GFX940-GISEL-NEXT: scratch_store_byte v3, v1, off sc0 sc1 +; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 @@ -434,13 +455,15 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff2_voff1: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc @@ -453,13 +476,15 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff2_voff1: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc @@ -472,9 +497,9 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff2_voff1: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-SDAG-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) @@ -489,12 +514,12 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff2_voff1: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 @@ -522,8 +547,9 @@ bb: define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX940-SDAG-LABEL: soff2_voff2: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 @@ -543,7 +569,8 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff2_voff2: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 @@ -565,16 +592,17 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff2_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: 
v_dual_lshlrev_b32 v0, 1, v0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 4, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v2, off dlc @@ -585,16 +613,17 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff2_voff2: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0 ; GFX11-GISEL-NEXT: 
v_add_nc_u32_e32 v4, 1, v0 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -606,12 +635,13 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff2_voff2: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 @@ -623,12 +653,14 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff2_voff2: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; 
GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 @@ -656,8 +688,9 @@ bb: define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX940-SDAG-LABEL: soff2_voff4: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 @@ -677,7 +710,8 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff2_voff4: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 @@ -699,16 +733,17 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff2_voff4: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 4, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 
2, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v2, off dlc @@ -719,16 +754,17 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff2_voff4: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -740,12 +776,13 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff2_voff4: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) 
| instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 @@ -757,12 +794,14 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff2_voff4: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 @@ -790,19 +829,20 @@ bb: define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX940-SDAG-LABEL: soff4_voff1: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, v1, v0 ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 -; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, 
off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -812,18 +852,19 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff4_voff1: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-GISEL-NEXT: v_add3_u32 v0, v1, s0, v0 ; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 1, v0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 2 -; GFX940-GISEL-NEXT: scratch_store_byte v3, v1, off sc0 sc1 +; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2 +; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 @@ -833,13 +874,15 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff4_voff1: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: 
s_lshl_b32 s0, s0, 2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc @@ -852,13 +895,15 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff4_voff1: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc @@ -871,9 +916,9 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff4_voff1: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: 
v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -888,12 +933,12 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff4_voff1: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 @@ -921,8 +966,9 @@ bb: define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX940-SDAG-LABEL: soff4_voff2: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2 @@ -942,7 +988,8 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff4_voff2: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 @@ -964,16 +1011,17 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff4_voff2: 
; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 4, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v2, off dlc @@ -984,16 +1032,17 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff4_voff2: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1005,12 +1054,13 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff4_voff2: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 @@ -1022,12 +1072,14 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff4_voff2: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; 
GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 @@ -1055,17 +1107,18 @@ bb: define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX940-SDAG-LABEL: soff4_voff4: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v3, 2 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 ; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 ; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, off offset:1 sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: scratch_store_byte v0, v3, off offset:2 sc0 sc1 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off offset:2 sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -1075,7 +1128,8 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff4_voff4: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 @@ -1097,12 +1151,14 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff4_voff4: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-SDAG-NEXT: 
v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v3, 4, v0 @@ -1116,16 +1172,17 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff4_voff4: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1137,12 +1194,13 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff4_voff4: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, 
s[0:1], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 @@ -1154,12 +1212,14 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff4_voff4: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index 850be72f06c7d..14d8b71c5167a 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -13,13 +13,13 @@ define amdgpu_kernel void 
@zero_init_kernel() { ; GFX9-LABEL: zero_init_kernel: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 ; GFX9-NEXT: s_mov_b32 s1, s0 ; GFX9-NEXT: s_mov_b32 s2, s0 ; GFX9-NEXT: s_mov_b32 s3, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 @@ -31,10 +31,10 @@ define amdgpu_kernel void @zero_init_kernel() { ; ; GFX10-LABEL: zero_init_kernel: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s0, s0, s3 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: s_add_u32 s6, s6, s11 +; GFX10-NEXT: s_addc_u32 s7, s7, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 ; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: s_mov_b32 s1, s0 ; GFX10-NEXT: s_mov_b32 s2, s0 @@ -83,18 +83,18 @@ define amdgpu_kernel void @zero_init_kernel() { ; ; GFX9-PAL-LABEL: zero_init_kernel: ; GFX9-PAL: ; %bb.0: -; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] -; GFX9-PAL-NEXT: s_mov_b32 s2, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX9-PAL-NEXT: s_mov_b32 s10, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: s_mov_b32 s1, s0 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-PAL-NEXT: s_mov_b32 s3, s0 +; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 @@ -120,15 +120,15 @@ define amdgpu_kernel void @zero_init_kernel() { ; ; GFX1010-PAL-LABEL: zero_init_kernel: ; GFX1010-PAL: ; %bb.0: -; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] -; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 -; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 +; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 ; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 @@ -145,15 +145,15 @@ define amdgpu_kernel void @zero_init_kernel() { ; ; GFX1030-PAL-LABEL: zero_init_kernel: ; GFX1030-PAL: ; %bb.0: -; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] -; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 -; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 
0xffff +; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 +; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 ; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 @@ -374,9 +374,9 @@ define void @zero_init_foo() { define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX9-LABEL: store_load_sindex_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 @@ -392,11 +392,11 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX10-LABEL: store_load_sindex_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s2, s2, s5 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: s_add_u32 s6, s6, s11 +; GFX10-NEXT: s_addc_u32 s7, s7, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s1, s0, 15 @@ -412,7 +412,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX11-LABEL: store_load_sindex_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 15 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s0, 15 @@ -428,7 +428,7 @@ define 
amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX12-LABEL: store_load_sindex_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 15 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s1, s0, 15 @@ -444,15 +444,15 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX9-PAL-LABEL: store_load_sindex_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] -; GFX9-PAL-NEXT: s_mov_b32 s4, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX9-PAL-NEXT: s_mov_b32 s10, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 -; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-PAL-NEXT: s_add_i32 s1, s1, 0 @@ -466,7 +466,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX940-LABEL: store_load_sindex_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v0, 15 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s1, s0, 2 @@ -482,16 +482,16 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX10-PAL-LABEL: store_load_sindex_kernel: ; GFX10-PAL: ; %bb.0: ; %bb -; GFX10-PAL-NEXT: s_getpc_b64 s[4:5] -; GFX10-PAL-NEXT: s_mov_b32 s4, s0 -; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-PAL-NEXT: s_getpc_b64 s[10:11] +; 
GFX10-PAL-NEXT: s_mov_b32 s10, s0 +; GFX10-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff -; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3 -; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 -; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s10, s10, s9 +; GFX10-PAL-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX10-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 @@ -507,7 +507,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX11-PAL-LABEL: store_load_sindex_kernel: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 @@ -523,7 +523,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX12-PAL-LABEL: store_load_sindex_kernel: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 @@ -707,9 +707,9 @@ bb: define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX9-LABEL: store_load_vindex_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9-NEXT: v_add_u32_e32 v1, 0, v0 ; 
GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: scratch_store_dword v1, v2, off @@ -721,10 +721,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX10-LABEL: store_load_vindex_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s0, s0, s3 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: s_add_u32 s6, s6, s11 +; GFX10-NEXT: s_addc_u32 s7, s7, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 0, v0 @@ -738,7 +738,8 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX11-LABEL: store_load_vindex_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0, v0 ; GFX11-NEXT: scratch_store_b32 v0, v1, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -749,7 +750,8 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX12-LABEL: store_load_vindex_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0, v0 ; GFX12-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -759,17 +761,17 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX9-PAL-LABEL: store_load_vindex_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] -; GFX9-PAL-NEXT: s_mov_b32 s2, s0 -; GFX9-PAL-NEXT: 
s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX9-PAL-NEXT: s_mov_b32 s10, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0, v0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc @@ -779,6 +781,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX940-LABEL: store_load_vindex_kernel: ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX940-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX940-NEXT: v_mov_b32_e32 v1, 15 ; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -789,15 +792,15 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX10-PAL-LABEL: store_load_vindex_kernel: ; GFX10-PAL: ; %bb.0: ; %bb -; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] -; GFX10-PAL-NEXT: s_mov_b32 s2, s0 -; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX10-PAL-NEXT: s_mov_b32 s10, s0 +; GFX10-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 -; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s10, s10, s9 +; GFX10-PAL-NEXT: s_addc_u32 s11, 
s11, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-PAL-NEXT: v_add_nc_u32_e32 v1, 0, v0 @@ -811,7 +814,8 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX11-PAL-LABEL: store_load_vindex_kernel: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0, v0 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -822,7 +826,8 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX12-PAL-LABEL: store_load_vindex_kernel: ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0, v0 ; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 @@ -1063,8 +1068,8 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) { define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX9-LABEL: zero_init_small_offset_kernel: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1083,10 +1088,10 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; ; GFX10-LABEL: 
zero_init_small_offset_kernel: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s0, s0, s3 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: s_add_u32 s6, s6, s11 +; GFX10-NEXT: s_addc_u32 s7, s7, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 ; GFX10-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, 0 @@ -1141,19 +1146,19 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; ; GFX9-PAL-LABEL: zero_init_small_offset_kernel: ; GFX9-PAL: ; %bb.0: -; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] -; GFX9-PAL-NEXT: s_mov_b32 s2, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX9-PAL-NEXT: s_mov_b32 s10, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc -; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_mov_b32 s1, s0 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-PAL-NEXT: s_mov_b32 s3, s0 +; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 @@ -1182,15 +1187,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; ; GFX1010-PAL-LABEL: zero_init_small_offset_kernel: ; GFX1010-PAL: ; %bb.0: -; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] -; GFX1010-PAL-NEXT: s_mov_b32 s2, 
s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 -; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 +; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) @@ -1209,15 +1214,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; ; GFX1030-PAL-LABEL: zero_init_small_offset_kernel: ; GFX1030-PAL: ; %bb.0: -; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] -; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 -; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 +; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX1030-PAL-NEXT: s_waitcnt 
vmcnt(0) ; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 @@ -1468,9 +1473,9 @@ define void @zero_init_small_offset_foo() { define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX9-LABEL: store_load_sindex_small_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s1 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1488,11 +1493,11 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX10-LABEL: store_load_sindex_small_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s2, s2, s5 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: s_add_u32 s6, s6, s11 +; GFX10-NEXT: s_addc_u32 s7, s7, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX10-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 @@ -1510,7 +1515,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX11-LABEL: store_load_sindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX11-NEXT: scratch_load_b32 v0, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, 15 @@ -1528,7 +1533,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX12-LABEL: store_load_sindex_small_offset_kernel: ; GFX12: ; %bb.0: ; %bb 
-; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 15 @@ -1546,17 +1551,17 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] -; GFX9-PAL-NEXT: s_mov_b32 s4, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff -; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX9-PAL-NEXT: s_mov_b32 s10, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX9-PAL-NEXT: s_mov_b32 s1, 0 +; GFX9-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s1 glc -; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -1571,7 +1576,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX940-LABEL: store_load_sindex_small_offset_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, 15 @@ -1589,16 +1594,16 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX1010-PAL-LABEL: store_load_sindex_small_offset_kernel: ; GFX1010-PAL: ; %bb.0: ; 
%bb -; GFX1010-PAL-NEXT: s_getpc_b64 s[4:5] -; GFX1010-PAL-NEXT: s_mov_b32 s4, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s5, s5, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s4, s4, s3 -; GFX1010-PAL-NEXT: s_addc_u32 s5, s5, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 -; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 +; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1010-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX1010-PAL-NEXT: s_mov_b32 s1, 0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) @@ -1617,16 +1622,16 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX1030-PAL-LABEL: store_load_sindex_small_offset_kernel: ; GFX1030-PAL: ; %bb.0: ; %bb -; GFX1030-PAL-NEXT: s_getpc_b64 s[4:5] -; GFX1030-PAL-NEXT: s_mov_b32 s4, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s5, s5, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s4, s4, s3 -; GFX1030-PAL-NEXT: s_addc_u32 s5, s5, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 -; GFX1030-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 +; 
GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1030-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -1644,7 +1649,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX11-PAL-LABEL: store_load_sindex_small_offset_kernel: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -1662,7 +1667,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX12-PAL-LABEL: store_load_sindex_small_offset_kernel: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -1900,8 +1905,8 @@ bb: define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX9-LABEL: store_load_vindex_small_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_load_dword v1, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1917,10 +1922,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; ; GFX10-LABEL: store_load_vindex_small_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s0, s0, s3 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 -; GFX10-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: s_add_u32 s6, s6, s11 +; GFX10-NEXT: s_addc_u32 s7, s7, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-NEXT: scratch_load_dword v3, off, off glc dlc @@ -1938,6 +1943,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: scratch_load_b32 v3, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0 ; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:256 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1950,6 +1957,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0 ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -1959,16 +1968,16 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; ; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] -; GFX9-PAL-NEXT: s_mov_b32 s2, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX9-PAL-NEXT: s_mov_b32 s10, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, 
s2, s1 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-PAL-NEXT: scratch_load_dword v1, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x100, v0 @@ -1984,6 +1993,7 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX940-NEXT: scratch_load_dword v1, off, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX940-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX940-NEXT: v_mov_b32_e32 v1, 15 ; GFX940-NEXT: scratch_store_dword v0, v1, off offset:256 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1994,15 +2004,15 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; ; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel: ; GFX1010-PAL: ; %bb.0: ; %bb -; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] -; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 -; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 +; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 @@ -2018,15 +2028,15 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; ; 
GFX1030-PAL-LABEL: store_load_vindex_small_offset_kernel: ; GFX1030-PAL: ; %bb.0: ; %bb -; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] -; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 -; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 +; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off glc dlc @@ -2044,6 +2054,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:256 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2056,6 +2068,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 +; GFX12-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0 ; 
GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 @@ -2246,8 +2260,8 @@ bb: define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX9-LABEL: zero_init_large_offset_kernel: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2267,10 +2281,10 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; ; GFX10-LABEL: zero_init_large_offset_kernel: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s0, s0, s3 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: s_add_u32 s6, s6, s11 +; GFX10-NEXT: s_addc_u32 s7, s7, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, 0 @@ -2327,19 +2341,19 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; ; GFX9-PAL-LABEL: zero_init_large_offset_kernel: ; GFX9-PAL: ; %bb.0: -; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] -; GFX9-PAL-NEXT: s_mov_b32 s2, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX9-PAL-NEXT: s_mov_b32 s10, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:4 glc -; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_mov_b32 s1, s0 ; 
GFX9-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-PAL-NEXT: s_mov_b32 s3, s0 +; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:4 glc +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 @@ -2370,15 +2384,15 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; ; GFX1010-PAL-LABEL: zero_init_large_offset_kernel: ; GFX1010-PAL: ; %bb.0: -; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] -; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 -; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 +; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 offset:4 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) @@ -2398,15 +2412,15 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; ; GFX1030-PAL-LABEL: zero_init_large_offset_kernel: ; GFX1030-PAL: ; %bb.0: -; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] -; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 +; 
GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 -; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 +; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 @@ -2711,9 +2725,9 @@ define void @zero_init_large_offset_foo() { define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX9-LABEL: store_load_sindex_large_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s1 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2731,11 +2745,11 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX10-LABEL: store_load_sindex_large_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s2, s2, s5 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: s_add_u32 s6, s6, s11 +; GFX10-NEXT: s_addc_u32 s7, s7, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 
+; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 @@ -2753,7 +2767,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX11-LABEL: store_load_sindex_large_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, 15 @@ -2771,7 +2785,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX12-LABEL: store_load_sindex_large_offset_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 15 @@ -2789,17 +2803,17 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] -; GFX9-PAL-NEXT: s_mov_b32 s4, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff -; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX9-PAL-NEXT: s_mov_b32 s10, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX9-PAL-NEXT: s_mov_b32 s1, 0 +; GFX9-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc -; GFX9-PAL-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -2814,7 +2828,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX940-LABEL: store_load_sindex_large_offset_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, 15 @@ -2832,16 +2846,16 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel: ; GFX1010-PAL: ; %bb.0: ; %bb -; GFX1010-PAL-NEXT: s_getpc_b64 s[4:5] -; GFX1010-PAL-NEXT: s_mov_b32 s4, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s5, s5, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s4, s4, s3 -; GFX1010-PAL-NEXT: s_addc_u32 s5, s5, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 -; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 +; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1010-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX1010-PAL-NEXT: s_mov_b32 s1, 0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) @@ -2860,16 +2874,16 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX1030-PAL-LABEL: 
store_load_sindex_large_offset_kernel: ; GFX1030-PAL: ; %bb.0: ; %bb -; GFX1030-PAL-NEXT: s_getpc_b64 s[4:5] -; GFX1030-PAL-NEXT: s_mov_b32 s4, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s5, s5, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s4, s4, s3 -; GFX1030-PAL-NEXT: s_addc_u32 s5, s5, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 -; GFX1030-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 +; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1030-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -2887,7 +2901,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX11-PAL-LABEL: store_load_sindex_large_offset_kernel: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -2905,7 +2919,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX12-PAL-LABEL: store_load_sindex_large_offset_kernel: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ 
-3143,8 +3157,8 @@ bb: define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX9-LABEL: store_load_vindex_large_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_load_dword v1, off, s0 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3160,10 +3174,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX10-LABEL: store_load_vindex_large_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s0, s0, s3 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: s_add_u32 s6, s6, s11 +; GFX10-NEXT: s_addc_u32 s7, s7, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc @@ -3182,6 +3196,8 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX11-NEXT: s_movk_i32 s0, 0x4004 ; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0 ; GFX11-NEXT: scratch_store_b32 v0, v1, s0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3194,6 +3210,8 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0x4000, 
v0 ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -3203,16 +3221,16 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] -; GFX9-PAL-NEXT: s_mov_b32 s2, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX9-PAL-NEXT: s_mov_b32 s10, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-PAL-NEXT: scratch_load_dword v1, off, s0 offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x4004, v0 @@ -3228,6 +3246,7 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX940-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX940-NEXT: v_mov_b32_e32 v1, 15 ; GFX940-NEXT: s_movk_i32 s0, 0x4004 ; GFX940-NEXT: scratch_store_dword v0, v1, s0 sc0 sc1 @@ -3239,15 +3258,15 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX1010-PAL-LABEL: store_load_vindex_large_offset_kernel: ; GFX1010-PAL: ; %bb.0: ; %bb -; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] -; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 
0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 -; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 +; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 @@ -3263,15 +3282,15 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX1030-PAL-LABEL: store_load_vindex_large_offset_kernel: ; GFX1030-PAL: ; %bb.0: ; %bb -; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] -; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 -; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 +; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc @@ -3290,6 +3309,8 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX11-PAL-NEXT: s_movk_i32 
s0, 0x4004 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, s0 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3302,6 +3323,8 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 +; GFX12-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x4000, v0 ; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 @@ -3495,8 +3518,8 @@ bb: define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX9-LABEL: store_load_large_imm_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4 @@ -3512,10 +3535,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; ; GFX10-LABEL: store_load_large_imm_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s0, s0, s3 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: s_add_u32 s6, s6, s11 +; GFX10-NEXT: s_addc_u32 s7, s7, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 ; GFX10-NEXT: v_mov_b32_e32 v0, 13 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: s_movk_i32 
s0, 0x3800 @@ -3553,15 +3576,15 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; ; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] -; GFX9-PAL-NEXT: s_mov_b32 s2, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX9-PAL-NEXT: s_mov_b32 s10, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:4 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 @@ -3588,15 +3611,15 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; ; GFX1010-PAL-LABEL: store_load_large_imm_offset_kernel: ; GFX1010-PAL: ; %bb.0: ; %bb -; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] -; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 -; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 +; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 
; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3800 @@ -3612,15 +3635,15 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; ; GFX1030-PAL-LABEL: store_load_large_imm_offset_kernel: ; GFX1030-PAL: ; %bb.0: ; %bb -; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] -; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 -; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 +; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3800 @@ -3818,10 +3841,10 @@ bb: define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX9-LABEL: store_load_vidx_sidx_offset: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 @@ -3834,11 +3857,11 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX10-LABEL: store_load_vidx_sidx_offset: ; 
GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s2, s2, s5 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: s_add_u32 s6, s6, s11 +; GFX10-NEXT: s_addc_u32 s7, s7, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 @@ -3851,10 +3874,11 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX11-LABEL: store_load_vidx_sidx_offset: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, 0 ; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:1024 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3864,9 +3888,10 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX12-LABEL: store_load_vidx_sidx_offset: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 15 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_lshl_u32 v0, s0, v0, 2 ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1024 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -3876,16 +3901,16 @@ define amdgpu_kernel void 
@store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX9-PAL-LABEL: store_load_vidx_sidx_offset: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] -; GFX9-PAL-NEXT: s_mov_b32 s4, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX9-PAL-NEXT: s_mov_b32 s10, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 +; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 ; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 @@ -3896,7 +3921,8 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX940-LABEL: store_load_vidx_sidx_offset: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v0, s0, v0 @@ -3910,16 +3936,16 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX10-PAL-LABEL: store_load_vidx_sidx_offset: ; GFX10-PAL: ; %bb.0: ; %bb -; GFX10-PAL-NEXT: s_getpc_b64 s[4:5] -; GFX10-PAL-NEXT: s_mov_b32 s4, s0 -; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-PAL-NEXT: s_getpc_b64 s[10:11] +; GFX10-PAL-NEXT: s_mov_b32 s10, s0 +; GFX10-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff -; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3 -; 
GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 -; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-PAL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s10, s10, s9 +; GFX10-PAL-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX10-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-PAL-NEXT: v_add_nc_u32_e32 v0, s0, v0 @@ -3932,10 +3958,11 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX11-PAL-LABEL: store_load_vidx_sidx_offset: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0 -; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-PAL-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX11-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 0 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:1024 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3945,9 +3972,10 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX12-PAL-LABEL: store_load_vidx_sidx_offset: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX12-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 +; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-PAL-NEXT: v_add_lshl_u32 v0, s0, v0, 2 ; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:1024 
scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll index e44572985e6d2..c9618d43943ef 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll @@ -6,14 +6,14 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_add_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -21,14 +21,14 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -36,11 +36,11 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -55,14 +55,14 @@ entry: define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_add_i32_max_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 0xffc -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 0xffc +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -70,14 +70,14 @@ define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32_max_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 0xffc -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 0xffc +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -85,11 +85,11 @@ define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32_max_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:4092 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -104,14 +104,14 @@ entry: define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_add_i32_max_offset_p1: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 0x1000 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 0x1000 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -119,14 +119,14 @@ define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32_max_offset_p1: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 0x1000 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 
0x1000 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -134,11 +134,11 @@ define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32_max_offset_p1: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_mov_b32_e32 v2, s4 @@ -155,8 +155,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_add_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -173,8 +173,8 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_add_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -191,12 +191,12 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; 
GCN3-LABEL: atomic_add_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -214,18 +214,18 @@ entry: define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -233,18 +233,18 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_add_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 
0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -252,11 +252,11 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_add_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -276,18 +276,18 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: 
v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -298,18 +298,18 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_add_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -320,11 +320,11 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_add_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -348,12 +348,12 @@ entry: define amdgpu_kernel void @atomic_add_i32(ptr %out, i32 %in) { ; GCN1-LABEL: 
atomic_add_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -361,12 +361,12 @@ define amdgpu_kernel void @atomic_add_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -374,11 +374,11 @@ define amdgpu_kernel void @atomic_add_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_add v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -392,8 +392,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret(ptr %out, ptr %out2, i32 %in) 
{ ; GCN1-LABEL: atomic_add_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -408,8 +408,8 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -424,12 +424,12 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -446,16 +446,16 @@ entry: define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: 
s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -463,16 +463,16 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_add_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -480,11 +480,11 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_add_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -503,16 +503,16 @@ entry: define amdgpu_kernel void 
@atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -523,16 +523,16 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_add_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -543,11 +543,11 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_add_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 
0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -570,14 +570,14 @@ entry: define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_and_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_and v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -585,14 +585,14 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_and_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_and v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: 
buffer_wbinvl1_vol @@ -600,11 +600,11 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_and_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -619,8 +619,8 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_and_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -637,8 +637,8 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_and_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -655,12 +655,12 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_and_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: 
v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -678,18 +678,18 @@ entry: define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_and v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -697,18 +697,18 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_and_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; 
GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_and v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -716,11 +716,11 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_and_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -740,18 +740,18 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -762,18 +762,18 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr 
%out, ptr %out2, ; ; GCN2-LABEL: atomic_and_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -784,11 +784,11 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_and_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -812,12 +812,12 @@ entry: define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_and_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; 
GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_and v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -825,12 +825,12 @@ define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_and_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_and v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -838,11 +838,11 @@ define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_and_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_and v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -856,8 +856,8 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_and_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: 
v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -872,8 +872,8 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_and_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -888,12 +888,12 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_and_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -910,16 +910,16 @@ entry: define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_and v[0:1], v2 ; GCN1-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -927,16 +927,16 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_and_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_and v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -944,11 +944,11 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_and_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -967,16 +967,16 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -987,16 +987,16 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_and_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1007,11 +1007,11 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_and_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], 
s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1034,14 +1034,14 @@ entry: define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_sub_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1049,14 +1049,14 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_sub_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1064,11 +1064,11 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_sub_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1083,8 +1083,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_sub_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -1101,8 +1101,8 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_sub_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -1119,12 +1119,12 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_sub_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -1142,18 +1142,18 @@ entry: define 
amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1161,18 +1161,18 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_sub_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1180,11 +1180,11 @@ define amdgpu_kernel void 
@atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_sub_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1204,18 +1204,18 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1226,18 +1226,18 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_sub_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1248,11 +1248,11 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_sub_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1276,12 +1276,12 @@ entry: define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_sub_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1289,12 +1289,12 @@ define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_sub_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1302,11 +1302,11 @@ define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_sub_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_sub v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1320,8 +1320,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_sub_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1336,8 +1336,8 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_sub_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1352,12 +1352,12 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_sub_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -1374,16 +1374,16 @@ entry: define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1391,16 +1391,16 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_sub_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; 
GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1408,11 +1408,11 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_sub_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1431,16 +1431,16 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], 
s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1451,16 +1451,16 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_sub_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1471,11 +1471,11 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_sub_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1498,39 +1498,39 @@ entry: define amdgpu_kernel void 
@atomic_max_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_max_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_smax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_smax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_smax v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -1544,8 +1544,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, 
ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_max_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -1562,8 +1562,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_max_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -1580,12 +1580,12 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_max_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -1603,47 +1603,47 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt 
lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_smax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_smax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1662,18 +1662,18 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr 
%out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -1684,18 +1684,18 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -1706,11 +1706,11 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; 
GCN3-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1734,35 +1734,35 @@ entry: define amdgpu_kernel void @atomic_max_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_max_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_smax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_smax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; 
GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_smax v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -1775,8 +1775,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_max_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1791,8 +1791,8 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_max_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1807,12 +1807,12 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_max_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -1829,43 +1829,43 @@ entry: define amdgpu_kernel void 
@atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_smax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_smax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; 
GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1883,16 +1883,16 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -1903,16 +1903,16 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_max_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: 
v_mov_b32_e32 v0, s6 @@ -1923,11 +1923,11 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_max_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1950,39 +1950,39 @@ entry: define amdgpu_kernel void @atomic_umax_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_umax_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_umax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; 
GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_umax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_umax v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -1996,8 +1996,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_umax_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -2014,8 +2014,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_umax_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -2032,12 +2032,12 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_umax_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -2055,47 +2055,47 @@ entry: define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_umax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; 
GCN2-NEXT: flat_atomic_umax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2114,18 +2114,18 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -2136,18 +2136,18 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, 
s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -2158,11 +2158,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2186,35 +2186,35 @@ entry: define amdgpu_kernel void @atomic_umax_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_umax_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_umax v[0:1], v2 ; GCN1-NEXT: 
s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_umax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_umax v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -2227,8 +2227,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_umax_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2243,8 +2243,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_umax_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2259,12 +2259,12 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_umax_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -2281,43 +2281,43 @@ entry: define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_umax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; 
GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_umax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2335,16 +2335,16 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -2355,16 +2355,16 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; 
GCN2-LABEL: atomic_umax_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -2375,11 +2375,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2402,39 +2402,39 @@ entry: define amdgpu_kernel void @atomic_min_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_min_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, 
s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_smin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_smin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_smin v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -2448,8 +2448,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_min_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -2466,8 +2466,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr 
%out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_min_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -2484,12 +2484,12 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_min_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -2507,47 +2507,47 @@ entry: define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_smin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: 
s_endpgm ; ; GCN2-LABEL: atomic_min_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_smin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2566,18 +2566,18 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 
s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -2588,18 +2588,18 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -2610,11 +2610,11 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: 
s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2638,35 +2638,35 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_min_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_smin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_smin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_smin v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -2679,8 +2679,8 @@ entry: define amdgpu_kernel void 
@atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_min_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2695,8 +2695,8 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_min_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2711,12 +2711,12 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_min_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -2733,43 +2733,43 @@ entry: define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: 
s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_smin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_smin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2787,16 +2787,16 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -2807,16 +2807,16 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_min_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -2827,11 +2827,11 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_min_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; 
GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2854,39 +2854,39 @@ entry: define amdgpu_kernel void @atomic_umin_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_umin_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_umin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_umin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umin_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: 
v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_umin v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -2900,8 +2900,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_umin_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -2918,8 +2918,8 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_umin_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -2936,12 +2936,12 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_umin_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -2959,47 +2959,47 @@ entry: define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: 
atomic_umin_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_umin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_umin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umin_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 
0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3018,18 +3018,18 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -3040,18 +3040,18 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_umin_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, 
s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -3062,11 +3062,11 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN3-LABEL: atomic_umin_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3090,35 +3090,35 @@ entry: define amdgpu_kernel void @atomic_umin_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_umin_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_umin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: 
v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_umin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umin_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_umin v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -3131,8 +3131,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_umin_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3147,8 +3147,8 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_umin_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -3163,12 +3163,12 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_umin_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword 
s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -3185,43 +3185,43 @@ entry: define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_umin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_umin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: 
atomic_umin_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3239,16 +3239,16 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -3259,16 +3259,16 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_umin_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: 
s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -3279,11 +3279,11 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_umin_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3306,14 +3306,14 @@ entry: define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_or_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_or v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3321,14 +3321,14 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_or_i32_offset: ; GCN2: ; 
%bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_or v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3336,11 +3336,11 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_or_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3355,8 +3355,8 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_or_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -3373,8 +3373,8 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in ; ; GCN2-LABEL: atomic_or_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s2, s[0:1], 
0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -3391,12 +3391,12 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in ; ; GCN3-LABEL: atomic_or_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -3414,18 +3414,18 @@ entry: define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_or v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3433,18 +3433,18 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %i ; ; GCN2-LABEL: atomic_or_i32_addr64_offset: ; 
GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_or v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3452,11 +3452,11 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %i ; ; GCN3-LABEL: atomic_or_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3476,18 +3476,18 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; 
GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3498,18 +3498,18 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_or_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3520,11 +3520,11 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_or_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) 
-; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3548,12 +3548,12 @@ entry: define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_or_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_or v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3561,12 +3561,12 @@ define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_or_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_or v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3574,11 +3574,11 @@ define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_or_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, 
s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_or v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3592,8 +3592,8 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_or_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3608,8 +3608,8 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_or_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -3624,12 +3624,12 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_or_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -3646,16 +3646,16 @@ entry: define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_or v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3663,16 +3663,16 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) { ; ; GCN2-LABEL: atomic_or_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_or v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3680,11 +3680,11 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) { ; ; GCN3-LABEL: atomic_or_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; 
GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3703,16 +3703,16 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3723,16 +3723,16 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in ; ; GCN2-LABEL: atomic_or_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN2-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3743,11 +3743,11 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in ; ; GCN3-LABEL: atomic_or_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3770,14 +3770,14 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_xchg_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3785,14 +3785,14 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_xchg_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: 
s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3800,11 +3800,11 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_xchg_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3819,14 +3819,14 @@ entry: define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) { ; GCN1-LABEL: atomic_xchg_f32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3834,14 +3834,14 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) { ; ; GCN2-LABEL: atomic_xchg_f32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c 
+; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3849,11 +3849,11 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) { ; ; GCN3-LABEL: atomic_xchg_f32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3868,8 +3868,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_xchg_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -3886,8 +3886,8 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_xchg_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 
0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -3904,12 +3904,12 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_xchg_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -3927,18 +3927,18 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3946,18 +3946,18 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 ; ; GCN2-LABEL: atomic_xchg_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; 
GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3965,11 +3965,11 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 ; ; GCN3-LABEL: atomic_xchg_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3989,18 +3989,18 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], 
s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4011,18 +4011,18 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_xchg_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4033,11 +4033,11 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN3-LABEL: atomic_xchg_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; 
GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -4061,12 +4061,12 @@ entry: define amdgpu_kernel void @atomic_xchg_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_xchg_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4074,12 +4074,12 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_xchg_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4087,11 +4087,11 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_xchg_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: 
v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4105,8 +4105,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_xchg_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -4121,8 +4121,8 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_xchg_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -4137,12 +4137,12 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_xchg_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4159,16 +4159,16 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4176,16 +4176,16 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_xchg_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4193,11 +4193,11 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_xchg_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c 
; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -4216,16 +4216,16 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4236,16 +4236,16 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_xchg_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; 
GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4256,11 +4256,11 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_xchg_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -4285,7 +4285,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -4300,7 +4300,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old ; ; GCN2-LABEL: atomic_cmpxchg_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -4315,7 +4315,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old ; ; GCN3-LABEL: atomic_cmpxchg_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 @@ -4334,8 +4334,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i32 
%in, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s2, s4, 16 ; GCN1-NEXT: s_addc_u32 s3, s5, 0 @@ -4353,8 +4353,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3 ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s2, s4, 16 ; GCN2-NEXT: s_addc_u32 s3, s5, 0 @@ -4372,13 +4372,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3 ; ; GCN3-LABEL: atomic_cmpxchg_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s3 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4397,19 +4397,19 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: 
s_load_dword s7, s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dword s6, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xf ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s2 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4418,19 +4418,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i ; ; GCN2-LABEL: atomic_cmpxchg_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s7, s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x3c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v1, s2 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4439,12 +4439,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i ; ; GCN3-LABEL: atomic_cmpxchg_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; 
GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s7, s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s7, s[2:3], 0x3c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 @@ -4465,19 +4465,19 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s9, s[0:1], 0x11 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dword s8, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x11 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: v_mov_b32_e32 v0, s8 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s9 +; GCN1-NEXT: v_mov_b32_e32 v1, s2 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4489,19 +4489,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 
-; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s9, s[0:1], 0x44 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x44 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: v_mov_b32_e32 v0, s8 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s9 +; GCN2-NEXT: v_mov_b32_e32 v1, s2 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4513,12 +4513,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o ; ; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s9, s[0:1], 0x44 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s9, s[2:3], 0x44 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: v_mov_b32_e32 v0, s8 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 @@ -4544,7 +4544,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 @@ -4557,7 +4557,7 @@ define 
amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; ; GCN2-LABEL: atomic_cmpxchg_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 @@ -4570,7 +4570,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; ; GCN3-LABEL: atomic_cmpxchg_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 @@ -4588,8 +4588,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 @@ -4605,8 +4605,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 @@ -4622,13 +4622,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, ; ; GCN3-LABEL: atomic_cmpxchg_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s3 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4646,17 +4646,17 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %index, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s7, s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dword s6, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xf ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s2 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4665,17 +4665,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind ; ; GCN2-LABEL: atomic_cmpxchg_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s7, s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x3c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; 
GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v1, s2 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4684,12 +4684,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind ; ; GCN3-LABEL: atomic_cmpxchg_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s7, s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s7, s[2:3], 0x3c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 @@ -4709,17 +4709,17 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s9, s[0:1], 0x11 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dword s8, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x11 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: v_mov_b32_e32 v0, s8 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; 
GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s9 +; GCN1-NEXT: v_mov_b32_e32 v1, s2 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4731,17 +4731,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3 ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s9, s[0:1], 0x44 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x44 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: v_mov_b32_e32 v0, s8 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s9 +; GCN2-NEXT: v_mov_b32_e32 v1, s2 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4753,12 +4753,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3 ; ; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s9, s[0:1], 0x44 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s9, s[2:3], 0x44 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: v_mov_b32_e32 v0, s8 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; 
GCN3-NEXT: s_addc_u32 s1, s5, s1 @@ -4783,14 +4783,14 @@ entry: define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_xor_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4798,14 +4798,14 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_xor_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4813,11 +4813,11 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_xor_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 
-; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4832,8 +4832,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_xor_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -4850,8 +4850,8 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_xor_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -4868,12 +4868,12 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_xor_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4891,18 +4891,18 @@ entry: define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i32_addr64_offset: ; GCN1: 
; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4910,18 +4910,18 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_xor_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4929,11 +4929,11 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_xor_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -4953,18 +4953,18 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4975,18 +4975,18 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_xor_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: 
s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4997,11 +4997,11 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_xor_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5025,12 +5025,12 @@ entry: define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_xor_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -5038,12 +5038,12 @@ define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_xor_i32: ; GCN2: ; %bb.0: ; %entry 
-; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -5051,11 +5051,11 @@ define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_xor_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_xor v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5069,8 +5069,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_xor_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -5085,8 +5085,8 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_xor_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -5101,12 +5101,12 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_xor_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -5123,16 +5123,16 @@ entry: define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -5140,16 +5140,16 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_xor_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; 
GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -5157,11 +5157,11 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_xor_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5180,16 +5180,16 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: 
v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -5200,16 +5200,16 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_xor_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -5220,11 +5220,11 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_xor_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5247,7 +5247,7 @@ entry: define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 
0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -5263,7 +5263,7 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -5279,7 +5279,7 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -5300,7 +5300,7 @@ entry: define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -5314,7 +5314,7 @@ define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -5328,7 +5328,7 @@ define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -5348,8 +5348,8 @@ entry: define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 %index) { ; 
GCN1-LABEL: atomic_load_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5368,8 +5368,8 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_load_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5388,10 +5388,10 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN3-LABEL: atomic_load_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5414,8 +5414,8 @@ entry: define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5432,8 +5432,8 @@ define amdgpu_kernel void 
@atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN2-LABEL: atomic_load_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5450,10 +5450,10 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN3-LABEL: atomic_load_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5475,37 +5475,37 @@ entry: define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, 
s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm @@ -5518,33 +5518,33 @@ entry: define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; 
GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -5556,8 +5556,8 @@ entry: define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 @@ -5572,8 +5572,8 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_store_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 @@ -5588,15 +5588,15 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64 ; ; GCN3-LABEL: atomic_store_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -5609,8 +5609,8 @@ entry: define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 @@ -5623,8 +5623,8 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index ; ; GCN2-LABEL: atomic_store_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 @@ -5637,15 +5637,15 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index ; ; GCN3-LABEL: atomic_store_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ 
-5657,7 +5657,7 @@ entry: define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -5673,7 +5673,7 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -5689,7 +5689,7 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -5710,7 +5710,7 @@ entry: define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -5724,7 +5724,7 @@ define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -5738,7 +5738,7 @@ define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -5758,8 +5758,8 @@ entry: define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_f32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5778,8 +5778,8 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_load_f32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5798,10 +5798,10 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN3-LABEL: atomic_load_f32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5824,8 +5824,8 @@ entry: define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_f32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; 
GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5842,8 +5842,8 @@ define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN2-LABEL: atomic_load_f32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5860,10 +5860,10 @@ define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN3-LABEL: atomic_load_f32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5885,37 +5885,37 @@ entry: define amdgpu_kernel void @atomic_store_f32_offset(float %in, ptr %out) { ; GCN1-LABEL: atomic_store_f32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: 
s_endpgm ; ; GCN2-LABEL: atomic_store_f32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_f32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm @@ -5928,33 +5928,33 @@ entry: define amdgpu_kernel void @atomic_store_f32(float %in, ptr %out) { ; GCN1-LABEL: atomic_store_f32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 +; GCN2-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_f32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -5966,8 +5966,8 @@ entry: define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_f32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 @@ -5982,8 +5982,8 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i ; ; GCN2-LABEL: atomic_store_f32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 @@ -5998,15 +5998,15 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i ; ; GCN3-LABEL: 
atomic_store_f32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -6019,8 +6019,8 @@ entry: define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_f32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 @@ -6033,8 +6033,8 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %ind ; ; GCN2-LABEL: atomic_store_f32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 @@ -6047,15 +6047,15 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %ind ; ; GCN3-LABEL: atomic_store_f32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN3-NEXT: 
s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -6067,7 +6067,7 @@ entry: define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i8_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -6083,7 +6083,7 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i8_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -6099,7 +6099,7 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i8_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -6120,7 +6120,7 @@ entry: define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i8: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -6134,7 +6134,7 @@ define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i8: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, 
s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -6148,7 +6148,7 @@ define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i8: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -6168,8 +6168,8 @@ entry: define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i8_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 @@ -6187,8 +6187,8 @@ define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 % ; ; GCN2-LABEL: atomic_load_i8_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 @@ -6206,11 +6206,11 @@ define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 % ; ; GCN3-LABEL: atomic_load_i8_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_add_u32 s0, s4, s2 -; GCN3-NEXT: s_addc_u32 s1, s5, s3 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_ubyte 
v2, v[0:1] offset:16 glc @@ -6231,37 +6231,37 @@ entry: define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i8_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_store_byte v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i8_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_store_byte v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i8_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_byte v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm @@ -6274,33 +6274,33 @@ entry: define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr 
%out) { ; GCN1-LABEL: atomic_store_i8: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_store_byte v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i8: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_store_byte v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i8: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_byte v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -6312,8 +6312,8 @@ entry: define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i8_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; 
GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, s6 ; GCN1-NEXT: s_addc_u32 s1, s5, s7 @@ -6327,8 +6327,8 @@ define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 % ; ; GCN2-LABEL: atomic_store_i8_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, s6 ; GCN2-NEXT: s_addc_u32 s1, s5, s7 @@ -6342,14 +6342,14 @@ define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 % ; ; GCN3-LABEL: atomic_store_i8_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s0, s4, s6 ; GCN3-NEXT: s_addc_u32 s1, s5, s7 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_store_byte v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -6362,7 +6362,7 @@ entry: define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i16_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -6378,7 +6378,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i16_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -6394,7 +6394,7 
@@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i16_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -6415,7 +6415,7 @@ entry: define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i16: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -6429,7 +6429,7 @@ define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i16: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -6443,7 +6443,7 @@ define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i16: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -6463,8 +6463,8 @@ entry: define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i16_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -6483,8 +6483,8 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN2-LABEL: 
atomic_load_i16_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -6503,10 +6503,10 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN3-LABEL: atomic_load_i16_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 1 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -6529,37 +6529,37 @@ entry: define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i16_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i16_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: 
s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i16_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm @@ -6572,33 +6572,33 @@ entry: define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i16: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i16: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 
; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i16: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -6610,8 +6610,8 @@ entry: define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i16_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 ; GCN1-NEXT: s_add_u32 s0, s4, s0 @@ -6626,8 +6626,8 @@ define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_store_i16_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 ; GCN2-NEXT: s_add_u32 s0, s4, s0 @@ -6642,15 +6642,15 @@ define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 ; ; GCN3-LABEL: atomic_store_i16_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 ; GCN3-NEXT: 
s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -6663,37 +6663,37 @@ entry: define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr %out) { ; GCN1-LABEL: atomic_store_f16_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f16_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_f16_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; 
GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm @@ -6706,33 +6706,33 @@ entry: define amdgpu_kernel void @atomic_store_f16(half %in, ptr %out) { ; GCN1-LABEL: atomic_store_f16: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f16: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_f16: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -6744,33 +6744,33 @@ entry: define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) 
{ ; GCN1-LABEL: atomic_store_bf16_offset: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_bf16_offset: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_bf16_offset: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -6782,33 +6782,33 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) { define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) { ; GCN1-LABEL: atomic_store_bf16: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN1-NEXT: 
s_load_dword s2, s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_bf16: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_bf16: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -6819,14 +6819,14 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) { define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: 
v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -6834,14 +6834,14 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -6849,11 +6849,11 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6868,14 +6868,14 @@ entry: define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_max_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: 
s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 0xffc -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 0xffc +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -6883,14 +6883,14 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32_max_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 0xffc -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 0xffc +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -6898,11 +6898,11 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32_max_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:4092 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6917,14 +6917,14 @@ entry: define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr 
%out, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_max_offset_p1: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 0x1000 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 0x1000 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -6932,14 +6932,14 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32_max_offset_p1: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 0x1000 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 0x1000 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -6947,11 +6947,11 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32_max_offset_p1: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, 
s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_mov_b32_e32 v2, s4 @@ -6968,8 +6968,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -6986,8 +6986,8 @@ define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_inc_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -7004,12 +7004,12 @@ define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_inc_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7027,18 +7027,18 @@ entry: define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i32_incr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7046,18 +7046,18 @@ define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_inc_i32_incr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7065,11 +7065,11 @@ define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_inc_i32_incr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7089,18 +7089,18 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i32_ret_incr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7111,18 +7111,18 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_inc_i32_ret_incr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: 
s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7133,11 +7133,11 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_inc_i32_ret_incr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7161,12 +7161,12 @@ entry: define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_inc_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7174,12 +7174,12 @@ define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7187,11 +7187,11 @@ define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7205,8 +7205,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -7221,8 +7221,8 @@ define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 
v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -7237,12 +7237,12 @@ define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7259,16 +7259,16 @@ entry: define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i32_incr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7276,16 +7276,16 @@ define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_inc_i32_incr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 
0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7293,11 +7293,11 @@ define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_inc_i32_incr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7316,16 +7316,16 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i32_ret_incr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: 
flat_atomic_inc v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7336,16 +7336,16 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_inc_i32_ret_incr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7356,11 +7356,11 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_inc_i32_ret_incr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7383,14 +7383,14 @@ entry: define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 16 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7398,14 +7398,14 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 16 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7413,11 +7413,11 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7432,14 +7432,14 @@ entry: define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, 
i32 %in) { ; GCN1-LABEL: atomic_dec_i32_max_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 0xffc -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 0xffc +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7447,14 +7447,14 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32_max_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 0xffc -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 0xffc +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7462,11 +7462,11 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32_max_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: 
v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:4092 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7481,14 +7481,14 @@ entry: define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_max_offset_p1: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s2, 0x1000 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 0x1000 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7496,14 +7496,14 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32_max_offset_p1: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s2, 0x1000 -; GCN2-NEXT: s_addc_u32 s1, s3, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 0x1000 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7511,11 +7511,11 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32_max_offset_p1: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, 
s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_mov_b32_e32 v2, s4 @@ -7532,8 +7532,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -7550,8 +7550,8 @@ define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_dec_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -7568,12 +7568,12 @@ define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_dec_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: 
buffer_wbinvl1_vol @@ -7591,18 +7591,18 @@ entry: define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i32_decr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7610,18 +7610,18 @@ define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_dec_i32_decr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7629,11 +7629,11 @@ 
define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_dec_i32_decr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7653,18 +7653,18 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i32_ret_decr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7675,18 +7675,18 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_dec_i32_ret_decr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 
+; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7697,11 +7697,11 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_dec_i32_ret_decr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7725,12 +7725,12 @@ entry: define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_dec_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7738,12 +7738,12 @@ define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7751,11 +7751,11 @@ define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7769,8 +7769,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -7785,8 +7785,8 @@ define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -7801,12 +7801,12 @@ define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7823,16 +7823,16 @@ entry: define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i32_decr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7840,16 +7840,16 @@ define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_dec_i32_decr64: ; GCN2: ; %bb.0: ; %entry -; 
GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7857,11 +7857,11 @@ define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_dec_i32_decr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7880,16 +7880,16 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i32_ret_decr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], 
s[0:1], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7900,16 +7900,16 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_dec_i32_ret_decr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7920,11 +7920,11 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_dec_i32_ret_decr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7947,7 +7947,7 @@ entry: define amdgpu_kernel void 
@atomic_load_f16_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f16_offset: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -7963,7 +7963,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f16_offset: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -7979,7 +7979,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f16_offset: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -7999,7 +7999,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f16: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -8013,7 +8013,7 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f16: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -8027,7 +8027,7 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f16: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: 
s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -8046,7 +8046,7 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_bf16_offset: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -8062,7 +8062,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_bf16_offset: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -8078,7 +8078,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_bf16_offset: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -8098,7 +8098,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_bf16: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -8112,7 +8112,7 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_bf16: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -8126,7 +8126,7 @@ define 
amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_bf16: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll index 5bd527149572e..4d80e9124f41f 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ -3823,7 +3823,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 @@ -3853,7 +3853,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN2-LABEL: atomic_max_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 @@ -3883,7 +3883,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN3-LABEL: atomic_max_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_ashr_i32 s5, s3, 31 ; GCN3-NEXT: s_mov_b32 s4, s3 @@ -3918,8 +3918,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; 
%entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -3953,8 +3953,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -3988,32 +3988,32 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s1, s3, 31 -; GCN3-NEXT: s_mov_b32 s0, s3 -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: s_ashr_i32 s3, s1, 31 +; GCN3-NEXT: s_mov_b32 s2, s1 +; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GCN3-NEXT: s_add_u32 s2, s4, s2 +; GCN3-NEXT: s_addc_u32 s3, s5, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: s_mov_b64 s[2:3], 0 ; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: 
v_max_i32_e32 v2, s2, v3 +; GCN3-NEXT: v_max_i32_e32 v2, s0, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB89_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -4029,7 +4029,7 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 @@ -4057,7 +4057,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; ; GCN2-LABEL: atomic_max_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 @@ -4085,7 +4085,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; ; GCN3-LABEL: atomic_max_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_ashr_i32 s5, s3, 31 ; GCN3-NEXT: s_mov_b32 s4, s3 @@ -4119,8 +4119,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; 
GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -4152,8 +4152,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_max_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -4185,32 +4185,32 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_max_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s1, s3, 31 -; GCN3-NEXT: s_mov_b32 s0, s3 -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: s_ashr_i32 s3, s1, 31 +; GCN3-NEXT: s_mov_b32 s2, s1 +; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GCN3-NEXT: s_add_u32 s2, s4, s2 +; GCN3-NEXT: s_addc_u32 s3, s5, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v2, v[0:1] -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: s_mov_b64 s[2:3], 0 ; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 +; 
GCN3-NEXT: v_max_i32_e32 v2, s0, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB91_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -4966,7 +4966,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_umax_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 @@ -4996,7 +4996,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; ; GCN2-LABEL: atomic_umax_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 @@ -5026,7 +5026,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; ; GCN3-LABEL: atomic_umax_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_ashr_i32 s5, s3, 31 ; GCN3-NEXT: s_mov_b32 s4, s3 @@ -5061,8 +5061,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) { 
; GCN1-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -5096,8 +5096,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -5131,32 +5131,32 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s1, s3, 31 -; GCN3-NEXT: s_mov_b32 s0, s3 -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: s_ashr_i32 s3, s1, 31 +; GCN3-NEXT: s_mov_b32 s2, s1 +; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GCN3-NEXT: s_add_u32 s2, s4, s2 +; GCN3-NEXT: s_addc_u32 s3, s5, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: s_mov_b64 s[2:3], 0 ; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_u32_e32 v2, s2, v3 +; GCN3-NEXT: v_max_u32_e32 v2, s0, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB103_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -5172,8 +5172,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_umax_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -5205,8 +5205,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -5238,32 +5238,32 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 
+; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s1, s3, 31 -; GCN3-NEXT: s_mov_b32 s0, s3 -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: s_ashr_i32 s3, s1, 31 +; GCN3-NEXT: s_mov_b32 s2, s1 +; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GCN3-NEXT: s_add_u32 s2, s4, s2 +; GCN3-NEXT: s_addc_u32 s3, s5, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v2, v[0:1] -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: s_mov_b64 s[2:3], 0 ; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_u32_e32 v2, s2, v3 +; GCN3-NEXT: v_max_u32_e32 v2, s0, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB104_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -6760,7 +6760,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_min_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 @@ -6790,7 +6790,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN2-LABEL: atomic_min_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 @@ -6820,7 +6820,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN3-LABEL: atomic_min_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_ashr_i32 s5, s3, 31 ; GCN3-NEXT: s_mov_b32 s4, s3 @@ -6855,8 +6855,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -6890,8 +6890,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -6925,32 +6925,32 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s1, s3, 31 -; GCN3-NEXT: s_mov_b32 s0, s3 -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: s_ashr_i32 s3, s1, 31 +; GCN3-NEXT: s_mov_b32 s2, s1 +; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GCN3-NEXT: s_add_u32 s2, s4, s2 +; GCN3-NEXT: s_addc_u32 s3, s5, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: s_mov_b64 s[2:3], 0 ; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 +; GCN3-NEXT: v_min_i32_e32 v2, s0, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB126_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -6966,8 +6966,8 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_min_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN1-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[4:5], 
s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 @@ -6990,8 +6990,8 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_min_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN2-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 @@ -7014,17 +7014,17 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_min_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN3-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 +; GCN3-NEXT: v_min_i32_e32 v2, s4, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7043,8 +7043,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_min_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 
@@ -7076,8 +7076,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_min_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -7109,32 +7109,32 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_min_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s1, s3, 31 -; GCN3-NEXT: s_mov_b32 s0, s3 -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: s_ashr_i32 s3, s1, 31 +; GCN3-NEXT: s_mov_b32 s2, s1 +; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GCN3-NEXT: s_add_u32 s2, s4, s2 +; GCN3-NEXT: s_addc_u32 s3, s5, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v2, v[0:1] -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: s_mov_b64 s[2:3], 0 ; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 +; GCN3-NEXT: v_min_i32_e32 v2, s0, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; 
GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB128_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll index b8c8d993d389b..5420733b7dc55 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_add_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -21,7 +21,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_add_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -36,7 +36,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_add_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -53,8 +53,8 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_add_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; 
GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -72,8 +72,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_add_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -92,8 +92,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_add_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -113,8 +113,8 @@ entry: define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -132,8 +132,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_add_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -152,8 +152,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_add_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -174,7 +174,7 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -195,7 +195,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_add_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -216,7 +216,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_add_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -240,7 +240,7 @@ entry: define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { ; 
GCN1-LABEL: atomic_add_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -253,7 +253,7 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_add_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -266,7 +266,7 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_add_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -282,8 +282,8 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_add_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -299,8 +299,8 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_add_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -317,8 +317,8 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr 
%out2, i64 %in) { ; GFX12-LABEL: atomic_add_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -337,8 +337,8 @@ entry: define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -354,8 +354,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_add_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -372,8 +372,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_add_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -393,7 +393,7 @@ entry: define amdgpu_kernel void 
@atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -412,7 +412,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_add_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -431,7 +431,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_add_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -454,7 +454,7 @@ entry: define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_and_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -469,7 +469,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_and_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -484,7 +484,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_and_i64_offset: ; GFX12: ; %bb.0: ; %entry -; 
GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -501,8 +501,8 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_and_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -520,8 +520,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_and_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -540,8 +540,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_and_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -561,8 +561,8 @@ entry: define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -580,8 +580,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_and_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -600,8 +600,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_and_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -622,7 +622,7 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -643,7 +643,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_and_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: 
v_mov_b32_e32 v1, s5 @@ -664,7 +664,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_and_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -688,7 +688,7 @@ entry: define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_and_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -701,7 +701,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_and_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -714,7 +714,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_and_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -730,8 +730,8 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_and_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -747,8 +747,8 @@ 
define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_and_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -765,8 +765,8 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_and_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -785,8 +785,8 @@ entry: define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -802,8 +802,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_and_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -820,8 +820,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr 
%out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_and_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -841,7 +841,7 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -860,7 +860,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_and_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -879,7 +879,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_and_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -902,7 +902,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_sub_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -917,7 +917,7 @@ define 
amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_sub_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -932,7 +932,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_sub_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -949,8 +949,8 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_sub_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -968,8 +968,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_sub_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -988,8 +988,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_sub_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: 
s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -1009,8 +1009,8 @@ entry: define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1028,8 +1028,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_sub_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1048,8 +1048,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_sub_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -1070,7 +1070,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; 
GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1091,7 +1091,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_sub_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1112,7 +1112,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -1136,7 +1136,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_sub_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -1149,7 +1149,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_sub_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -1162,7 +1162,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_sub_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, 
s2 :: v_dual_mov_b32 v3, s3 @@ -1178,8 +1178,8 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_sub_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1195,8 +1195,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_sub_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1213,8 +1213,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_sub_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -1233,8 +1233,8 @@ entry: define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1250,8 +1250,8 @@ define 
amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_sub_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1268,8 +1268,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_sub_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -1289,7 +1289,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1308,7 +1308,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_sub_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1327,7 +1327,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_sub_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 
0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -1350,7 +1350,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_max_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -1364,7 +1364,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_max_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -1378,7 +1378,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_max_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -1395,8 +1395,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_max_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -1414,8 +1414,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_max_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -1434,8 +1434,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_max_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -1455,8 +1455,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1473,8 +1473,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_max_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1492,8 +1492,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_max_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; 
GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -1514,7 +1514,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1535,7 +1535,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_max_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1556,7 +1556,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -1580,7 +1580,7 @@ entry: define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_max_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -1592,7 +1592,7 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_max_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -1604,7 +1604,7 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_max_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -1620,8 +1620,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_max_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1637,8 +1637,8 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_max_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1655,8 +1655,8 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_max_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, 
s0 :: v_dual_mov_b32 v3, s1 @@ -1675,8 +1675,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1691,8 +1691,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_max_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1708,8 +1708,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_max_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -1729,7 +1729,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1748,7 +1748,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i 
; ; GCN2-LABEL: atomic_max_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1767,7 +1767,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_max_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -1790,7 +1790,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_umax_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -1804,7 +1804,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_umax_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -1818,7 +1818,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_umax_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -1835,8 +1835,8 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_umax_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; 
GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -1854,8 +1854,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; ; GCN2-LABEL: atomic_umax_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -1874,8 +1874,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-LABEL: atomic_umax_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -1895,8 +1895,8 @@ entry: define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1913,8 +1913,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; ; GCN2-LABEL: atomic_umax_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1932,8 +1932,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-LABEL: atomic_umax_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -1954,7 +1954,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1975,7 +1975,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_umax_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1996,7 +1996,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, 
s4 :: v_dual_mov_b32 v1, s5 @@ -2020,7 +2020,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_umax_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -2032,7 +2032,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_umax_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -2044,7 +2044,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_umax_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2060,8 +2060,8 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_umax_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2077,8 +2077,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_umax_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: 
v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2095,8 +2095,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_umax_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -2115,8 +2115,8 @@ entry: define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2131,8 +2131,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_umax_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2148,8 +2148,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_umax_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], 
s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2169,7 +2169,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2188,7 +2188,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GCN2-LABEL: atomic_umax_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2207,7 +2207,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GFX12-LABEL: atomic_umax_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -2230,7 +2230,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_min_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -2244,7 +2244,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_min_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -2258,7 +2258,7 @@ define 
amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_min_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2275,8 +2275,8 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_min_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -2294,8 +2294,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_min_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -2314,8 +2314,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_min_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -2335,8 +2335,8 @@ entry: define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; 
GCN1-LABEL: atomic_min_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2353,8 +2353,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_min_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2372,8 +2372,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_min_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2394,7 +2394,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2415,7 +2415,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_min_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: 
s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2436,7 +2436,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -2460,7 +2460,7 @@ entry: define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_min_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -2472,7 +2472,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_min_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -2484,7 +2484,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_min_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2500,8 +2500,8 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_min_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2517,8 +2517,8 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_min_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2535,8 +2535,8 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_min_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -2555,8 +2555,8 @@ entry: define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2571,8 +2571,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_min_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; 
GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2588,8 +2588,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_min_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2609,7 +2609,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2628,7 +2628,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_min_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2647,7 +2647,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_min_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -2670,7 +2670,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_umin_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -2684,7 +2684,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_umin_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -2698,7 +2698,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_umin_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2715,8 +2715,8 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_umin_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -2734,8 +2734,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; ; GCN2-LABEL: atomic_umin_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -2754,8 +2754,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; 
GFX12-LABEL: atomic_umin_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -2775,8 +2775,8 @@ entry: define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2793,8 +2793,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; ; GCN2-LABEL: atomic_umin_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2812,8 +2812,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-LABEL: atomic_umin_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2834,7 +2834,7 @@ entry: define amdgpu_kernel 
void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2855,7 +2855,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_umin_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2876,7 +2876,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -2900,7 +2900,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_umin_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -2912,7 +2912,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_umin_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -2924,7 +2924,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_umin_i64: ; GFX12: ; %bb.0: ; %entry 
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2940,8 +2940,8 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_umin_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2957,8 +2957,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_umin_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2975,8 +2975,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_umin_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -2995,8 +2995,8 @@ entry: define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; 
GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3011,8 +3011,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_umin_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3028,8 +3028,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_umin_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3049,7 +3049,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3068,7 +3068,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GCN2-LABEL: atomic_umin_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -3087,7 +3087,7 @@ 
define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GFX12-LABEL: atomic_umin_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -3110,7 +3110,7 @@ entry: define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_or_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -3125,7 +3125,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_or_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -3140,7 +3140,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_or_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3157,8 +3157,8 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_or_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -3176,8 
+3176,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; ; GCN2-LABEL: atomic_or_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -3196,8 +3196,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GFX12-LABEL: atomic_or_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -3217,8 +3217,8 @@ entry: define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3236,8 +3236,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; ; GCN2-LABEL: atomic_or_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3256,8 +3256,8 
@@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GFX12-LABEL: atomic_or_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3278,7 +3278,7 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3299,7 +3299,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_or_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -3320,7 +3320,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_or_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -3344,7 +3344,7 @@ entry: define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_or_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: 
v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -3357,7 +3357,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_or_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -3370,7 +3370,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_or_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3386,8 +3386,8 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_or_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3403,8 +3403,8 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_or_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -3421,8 +3421,8 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_or_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; 
GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -3441,8 +3441,8 @@ entry: define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3458,8 +3458,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; ; GCN2-LABEL: atomic_or_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3476,8 +3476,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX12-LABEL: atomic_or_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3497,7 +3497,7 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], 
s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3516,7 +3516,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; ; GCN2-LABEL: atomic_or_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -3535,7 +3535,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; ; GFX12-LABEL: atomic_or_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -3558,7 +3558,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_xchg_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -3573,7 +3573,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_xchg_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -3588,7 +3588,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_xchg_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, 
s2 :: v_dual_mov_b32 v3, s3 @@ -3605,7 +3605,7 @@ entry: define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GCN1-LABEL: atomic_xchg_f64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -3620,7 +3620,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; ; GCN2-LABEL: atomic_xchg_f64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -3635,7 +3635,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; ; GFX12-LABEL: atomic_xchg_f64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3652,7 +3652,7 @@ entry: define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GCN1-LABEL: atomic_xchg_pointer_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -3667,7 +3667,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; ; GCN2-LABEL: atomic_xchg_pointer_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -3682,7 +3682,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, 
ptr %in) { ; ; GFX12-LABEL: atomic_xchg_pointer_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3699,8 +3699,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_xchg_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -3718,8 +3718,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; ; GCN2-LABEL: atomic_xchg_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -3738,8 +3738,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-LABEL: atomic_xchg_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -3759,8 +3759,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i64_addr64_offset: ; 
GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3778,8 +3778,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; ; GCN2-LABEL: atomic_xchg_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3798,8 +3798,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-LABEL: atomic_xchg_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3820,7 +3820,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3841,7 +3841,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: 
s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -3862,7 +3862,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -3886,7 +3886,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_xchg_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -3899,7 +3899,7 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_xchg_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -3912,7 +3912,7 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_xchg_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3928,8 +3928,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_xchg_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3945,8 +3945,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_xchg_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -3963,8 +3963,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_xchg_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -3983,8 +3983,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4000,8 +4000,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_xchg_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) 
; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4018,8 +4018,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_xchg_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4039,7 +4039,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -4058,7 +4058,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GCN2-LABEL: atomic_xchg_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -4077,7 +4077,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GFX12-LABEL: atomic_xchg_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -4100,7 +4100,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_xor_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -4115,7 +4115,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_xor_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -4130,7 +4130,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_xor_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4147,8 +4147,8 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_xor_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -4166,8 +4166,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_xor_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -4186,8 +4186,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: 
atomic_xor_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -4207,8 +4207,8 @@ entry: define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4226,8 +4226,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_xor_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4246,8 +4246,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_xor_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4268,7 +4268,7 @@ entry: define amdgpu_kernel void 
@atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -4289,7 +4289,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_xor_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -4310,7 +4310,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -4334,7 +4334,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_xor_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -4347,7 +4347,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_xor_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -4360,7 +4360,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_xor_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4376,8 +4376,8 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_xor_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -4393,8 +4393,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_xor_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -4411,8 +4411,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_xor_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -4431,8 +4431,8 @@ entry: define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4448,8 +4448,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_xor_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4466,8 +4466,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_xor_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4487,7 +4487,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -4506,7 +4506,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_xor_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -4525,7 +4525,7 @@ define amdgpu_kernel void 
@atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_xor_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -4548,7 +4548,7 @@ entry: define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -4564,7 +4564,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -4580,7 +4580,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; ; GFX12-LABEL: atomic_load_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4599,7 +4599,7 @@ entry: define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -4613,7 +4613,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i64: ; GCN2: ; %bb.0: ; %entry -; 
GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -4627,7 +4627,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) { ; ; GFX12-LABEL: atomic_load_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4645,8 +4645,8 @@ entry: define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -4665,8 +4665,8 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_load_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -4686,8 +4686,8 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-LABEL: atomic_load_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4710,8 +4710,8 @@ entry: define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -4728,8 +4728,8 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN2-LABEL: atomic_load_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -4747,8 +4747,8 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-LABEL: atomic_load_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4770,7 +4770,7 @@ entry: define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_u32 s0, s2, 32 @@ -4783,7 +4783,7 @@ define amdgpu_kernel void 
@atomic_store_i64_offset(i64 %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: s_add_u32 s0, s2, 32 @@ -4796,7 +4796,7 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { ; ; GFX12-LABEL: atomic_store_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4811,7 +4811,7 @@ entry: define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -4822,7 +4822,7 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -4833,7 +4833,7 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) { ; ; GFX12-LABEL: atomic_store_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4847,8 +4847,8 @@ entry: define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i64_addr64_offset: ; GCN1: 
; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4864,8 +4864,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_store_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4882,8 +4882,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 ; GFX12-LABEL: atomic_store_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -4902,8 +4902,8 @@ entry: define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4917,8 +4917,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index ; ; GCN2-LABEL: atomic_store_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; 
GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4933,8 +4933,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index ; GFX12-LABEL: atomic_store_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -4952,8 +4952,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s2, s4, 32 ; GCN1-NEXT: s_addc_u32 s3, s5, 0 @@ -4970,8 +4970,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; ; GCN2-LABEL: atomic_cmpxchg_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s2, s4, 32 ; GCN2-NEXT: s_addc_u32 s3, s5, 0 @@ -4989,8 +4989,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GFX12-LABEL: atomic_cmpxchg_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: 
s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5008,8 +5008,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_soffset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s2, s4, 0x11940 ; GCN1-NEXT: s_addc_u32 s3, s5, 0 @@ -5026,8 +5026,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; ; GCN2-LABEL: atomic_cmpxchg_i64_soffset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s2, s4, 0x11940 ; GCN2-NEXT: s_addc_u32 s3, s5, 0 @@ -5045,8 +5045,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GFX12-LABEL: atomic_cmpxchg_i64_soffset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5064,7 +5064,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in, i64 %old) { ; GCN1-LABEL: 
atomic_cmpxchg_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -5084,7 +5084,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; ; GCN2-LABEL: atomic_cmpxchg_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -5104,7 +5104,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -5126,7 +5126,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i64 %index, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5146,7 +5146,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; ; GCN2-LABEL: atomic_cmpxchg_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5166,7 +5166,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; ; GFX12-LABEL: 
atomic_cmpxchg_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5187,8 +5187,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x11 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -5211,8 +5211,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; ; GCN2-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -5236,8 +5236,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 +; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 @@ -5262,8 +5262,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { ; GCN1-LABEL: 
atomic_cmpxchg_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, s4 ; GCN1-NEXT: v_mov_b32_e32 v5, s5 @@ -5278,8 +5278,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { ; ; GCN2-LABEL: atomic_cmpxchg_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, s4 ; GCN2-NEXT: v_mov_b32_e32 v5, s5 @@ -5295,8 +5295,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { ; GFX12-LABEL: atomic_cmpxchg_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5313,7 +5313,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: v_mov_b32_e32 v5, s1 @@ -5331,7 +5331,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; ; GCN2-LABEL: atomic_cmpxchg_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; 
GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: v_mov_b32_e32 v5, s1 @@ -5349,7 +5349,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -5370,7 +5370,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %index, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5388,7 +5388,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; ; GCN2-LABEL: atomic_cmpxchg_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5406,7 +5406,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; ; GFX12-LABEL: atomic_cmpxchg_i64_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5426,8 +5426,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x11 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GCN1-NEXT: s_add_u32 s2, s4, s2 @@ -5448,8 +5448,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; ; GCN2-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GCN2-NEXT: s_add_u32 s2, s4, s2 @@ -5471,8 +5471,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 +; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 @@ -5496,7 +5496,7 @@ entry: define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -5512,7 +5512,7 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -5528,7 +5528,7 
@@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; ; GFX12-LABEL: atomic_load_f64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5547,7 +5547,7 @@ entry: define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -5561,7 +5561,7 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -5575,7 +5575,7 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { ; ; GFX12-LABEL: atomic_load_f64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5593,8 +5593,8 @@ entry: define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_f64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5613,8 +5613,8 
@@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_load_f64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5634,8 +5634,8 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-LABEL: atomic_load_f64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5658,8 +5658,8 @@ entry: define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_f64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5676,8 +5676,8 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN2-LABEL: atomic_load_f64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5695,8 +5695,8 @@ define amdgpu_kernel 
void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-LABEL: atomic_load_f64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5718,7 +5718,7 @@ entry: define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; GCN1-LABEL: atomic_store_f64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_u32 s0, s2, 32 @@ -5731,7 +5731,7 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_f64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: s_add_u32 s0, s2, 32 @@ -5744,7 +5744,7 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; ; GFX12-LABEL: atomic_store_f64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5759,7 +5759,7 @@ entry: define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) { ; GCN1-LABEL: atomic_store_f64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -5770,7 
+5770,7 @@ define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_f64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -5781,7 +5781,7 @@ define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) { ; ; GFX12-LABEL: atomic_store_f64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5795,8 +5795,8 @@ entry: define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_f64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5812,8 +5812,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, ; ; GCN2-LABEL: atomic_store_f64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5830,8 +5830,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, ; GFX12-LABEL: atomic_store_f64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -5850,8 +5850,8 @@ entry: define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_f64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5865,8 +5865,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in ; ; GCN2-LABEL: atomic_store_f64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5881,8 +5881,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in ; GFX12-LABEL: atomic_store_f64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -5900,7 +5900,7 @@ entry: define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_inc_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -5915,7 +5915,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_inc_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -5930,7 +5930,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_inc_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5947,8 +5947,8 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_inc_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -5966,8 +5966,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_inc_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -5986,8 +5986,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_inc_i64_ret_offset: ; GFX12: ; %bb.0: ; 
%entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -6007,8 +6007,8 @@ entry: define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i64_incr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -6026,8 +6026,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_inc_i64_incr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -6046,8 +6046,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_inc_i64_incr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6068,7 +6068,7 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, i64 
%in, i64 %index) { ; GCN1-LABEL: atomic_inc_i64_ret_incr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -6089,7 +6089,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_inc_i64_ret_incr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -6110,7 +6110,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_inc_i64_ret_incr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -6134,7 +6134,7 @@ entry: define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_inc_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -6147,7 +6147,7 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_inc_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -6160,7 +6160,7 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_inc_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 
s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -6176,8 +6176,8 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_inc_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -6193,8 +6193,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_inc_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -6211,8 +6211,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_inc_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -6231,8 +6231,8 @@ entry: define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i64_incr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -6248,8 +6248,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_inc_i64_incr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -6266,8 +6266,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_inc_i64_incr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6287,7 +6287,7 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i64_ret_incr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -6306,7 +6306,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_inc_i64_ret_incr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -6325,7 +6325,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; ; 
GFX12-LABEL: atomic_inc_i64_ret_incr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -6348,7 +6348,7 @@ entry: define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_dec_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -6363,7 +6363,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_dec_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -6378,7 +6378,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_dec_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -6395,8 +6395,8 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_dec_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -6414,8 +6414,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr 
%out2, i64 %i ; ; GCN2-LABEL: atomic_dec_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -6434,8 +6434,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_dec_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -6455,8 +6455,8 @@ entry: define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i64_decr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -6474,8 +6474,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_dec_i64_decr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -6494,8 +6494,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 
%in, i64 % ; GFX12-LABEL: atomic_dec_i64_decr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6516,7 +6516,7 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i64_ret_decr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -6537,7 +6537,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_dec_i64_ret_decr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -6558,7 +6558,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_dec_i64_ret_decr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -6582,7 +6582,7 @@ entry: define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_dec_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -6595,7 
+6595,7 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_dec_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -6608,7 +6608,7 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_dec_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -6624,8 +6624,8 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_dec_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -6641,8 +6641,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_dec_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -6659,8 +6659,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_dec_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; 
GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -6679,8 +6679,8 @@ entry: define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i64_decr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -6696,8 +6696,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_dec_i64_decr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -6714,8 +6714,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_dec_i64_decr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6735,7 +6735,7 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i64_ret_decr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; 
GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -6754,7 +6754,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_dec_i64_ret_decr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -6773,7 +6773,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_dec_i64_ret_decr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll index d812b4b7d86e6..7e4a36b7dc11b 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll @@ -4258,8 +4258,8 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -4292,8 +4292,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_max_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 
s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -4326,10 +4326,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN3-LABEL: atomic_max_i64_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v5, s1 @@ -4365,7 +4365,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s6 @@ -4402,7 +4402,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_max_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s6 @@ -4439,7 +4439,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_max_i64_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN3-NEXT: s_add_u32 s0, s0, s6 @@ -4482,8 +4482,8 @@ 
entry: define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -4514,8 +4514,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_max_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -4546,10 +4546,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN3-LABEL: atomic_max_i64_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v5, s1 @@ -4584,7 +4584,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s6 @@ -4619,7 +4619,7 @@ define amdgpu_kernel void 
@atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_max_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s6 @@ -4654,7 +4654,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN3-LABEL: atomic_max_i64_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN3-NEXT: s_add_u32 s0, s0, s6 @@ -5640,8 +5640,8 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5674,8 +5674,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; ; GCN2-LABEL: atomic_umax_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5708,10 +5708,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; ; GCN3-LABEL: atomic_umax_i64_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 
-; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v5, s1 @@ -5747,7 +5747,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s6 @@ -5784,7 +5784,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_umax_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s6 @@ -5821,7 +5821,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN3-LABEL: atomic_umax_i64_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN3-NEXT: s_add_u32 s0, s0, s6 @@ -5864,7 +5864,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s6 @@ -5899,7 +5899,7 @@ 
define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GCN2-LABEL: atomic_umax_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s6 @@ -5934,7 +5934,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GCN3-LABEL: atomic_umax_i64_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN3-NEXT: s_add_u32 s0, s0, s6 @@ -7864,8 +7864,8 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -7898,8 +7898,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_min_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -7932,10 +7932,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN3-LABEL: atomic_min_i64_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v5, s1 @@ -7971,7 +7971,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s6 @@ -8008,7 +8008,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_min_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s6 @@ -8045,7 +8045,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_min_i64_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN3-NEXT: s_add_u32 s0, s0, s6 @@ -8088,7 +8088,7 @@ entry: define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_min_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 @@ -8118,7 +8118,7 @@ define amdgpu_kernel void 
@atomic_min_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_min_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 @@ -8148,7 +8148,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; ; GCN3-LABEL: atomic_min_i64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -8183,7 +8183,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s6 @@ -8218,7 +8218,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_min_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s6 @@ -8253,7 +8253,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN3-LABEL: atomic_min_i64_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN3-NEXT: s_add_u32 s0, s0, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll index bac2d8b8b40c2..4846e21fe836e 100644 --- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll +++ 
b/llvm/test/CodeGen/AMDGPU/fma-combine.ll @@ -21,7 +21,7 @@ declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #0 define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_f64_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -41,7 +41,9 @@ define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: combine_to_fma_f64_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc @@ -75,7 +77,7 @@ define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_f64_0_2use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -101,7 +103,9 @@ define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_f64_0_2use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc @@ -146,7 +150,7 @@ define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr 
addrspace(1) noalias %o define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_f64_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -166,7 +170,9 @@ define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: combine_to_fma_f64_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc @@ -200,7 +206,7 @@ define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -220,7 +226,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_fsub_0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc @@ -254,7 +262,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %o define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noalias %out, ptr 
addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_f64_0_2use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -280,7 +288,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noali ; ; GFX11-LABEL: combine_to_fma_fsub_f64_0_2use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc @@ -325,7 +335,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noali define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_1_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -345,7 +355,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_fsub_1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc @@ -379,7 +391,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %o define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_1_f64_2use: ; SI: ; 
%bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -405,7 +417,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noali ; ; GFX11-LABEL: combine_to_fma_fsub_1_f64_2use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc @@ -450,7 +464,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noali define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_2_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -470,7 +484,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_fsub_2_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc @@ -506,7 +522,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %o define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_2_f64_2uses_neg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -532,7 +548,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1) ; ; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_neg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc @@ -579,7 +597,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1) define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_2_f64_2uses_mul: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -605,7 +623,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) ; ; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_mul: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc @@ -652,7 +672,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x9 ; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOFMA-NEXT: s_mov_b32 s6, 0 ; SI-NOFMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -678,7 +698,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) ; ; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 ; SI-FMA-NEXT: s_mov_b32 s6, 0 ; SI-FMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -703,7 +723,9 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) ; ; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc @@ -727,7 +749,9 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) ; ; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc @@ -774,7 +798,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x9 ; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOFMA-NEXT: s_mov_b32 s6, 0 ; SI-NOFMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -800,7 +824,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) ; ; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 ; SI-FMA-NEXT: s_mov_b32 s6, 0 ; SI-FMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -825,7 +849,9 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) ; ; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc @@ -849,7 +875,9 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) ; ; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc @@ -899,56 +927,56 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_add_x_one_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 -; 
SI-NOFMA-NEXT: s_mov_b32 s2, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s10, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s10 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s3 -; SI-NOFMA-NEXT: s_mov_b32 s10, s2 -; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s11 +; SI-NOFMA-NEXT: s_mov_b32 s2, s10 +; SI-NOFMA-NEXT: s_mov_b32 s3, s11 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: s_mov_b32 s0, s4 -; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: s_mov_b32 s8, s4 +; SI-NOFMA-NEXT: s_mov_b32 s9, s5 ; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_add_x_one_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s2, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s10, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s10 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s3 -; SI-FMA-NEXT: s_mov_b32 s10, s2 -; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s11 +; SI-FMA-NEXT: s_mov_b32 s2, s10 +; SI-FMA-NEXT: 
s_mov_b32 s3, s11 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc +; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: s_mov_b32 s0, s4 -; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: s_mov_b32 s8, s4 +; SI-FMA-NEXT: s_mov_b32 s9, s5 ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_add_x_one_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -966,8 +994,8 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_add_x_one_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -992,56 +1020,56 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_add_x_one: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s2, -1 -; SI-NOFMA-NEXT: 
s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s10, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s10 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s3 -; SI-NOFMA-NEXT: s_mov_b32 s10, s2 -; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s11 +; SI-NOFMA-NEXT: s_mov_b32 s2, s10 +; SI-NOFMA-NEXT: s_mov_b32 s3, s11 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: s_mov_b32 s0, s4 -; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: s_mov_b32 s8, s4 +; SI-NOFMA-NEXT: s_mov_b32 s9, s5 ; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_add_x_one: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s2, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s10, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s10 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s3 -; SI-FMA-NEXT: s_mov_b32 s10, s2 -; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s11 +; SI-FMA-NEXT: s_mov_b32 s2, s10 +; SI-FMA-NEXT: s_mov_b32 s3, s11 ; SI-FMA-NEXT: buffer_load_dword v0, 
off, s[12:15], 0 glc ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc +; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: s_mov_b32 s0, s4 -; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: s_mov_b32 s8, s4 +; SI-FMA-NEXT: s_mov_b32 s9, s5 ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_add_x_one: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -1059,8 +1087,8 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_add_x_one: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -1085,55 +1113,55 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_add_x_negone_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s2, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s10, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s10 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s3 -; SI-NOFMA-NEXT: s_mov_b32 s10, s2 -; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s11 +; SI-NOFMA-NEXT: s_mov_b32 s2, s10 +; SI-NOFMA-NEXT: s_mov_b32 s3, s11 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-NOFMA-NEXT: s_mov_b32 s0, s4 -; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_mov_b32 s8, s4 +; SI-NOFMA-NEXT: s_mov_b32 s9, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_add_x_negone_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s2, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s10, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s10 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s3 -; SI-FMA-NEXT: s_mov_b32 s10, s2 -; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s11 +; SI-FMA-NEXT: s_mov_b32 s2, s10 +; SI-FMA-NEXT: s_mov_b32 s3, s11 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, 
s[8:11], 0 -; SI-FMA-NEXT: s_mov_b32 s0, s4 -; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-FMA-NEXT: s_mov_b32 s8, s4 +; SI-FMA-NEXT: s_mov_b32 s9, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_add_x_negone_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1152,8 +1180,8 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_add_x_negone_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1178,55 +1206,55 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_add_x_negone: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s2, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s10, 
-1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s10 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s3 -; SI-NOFMA-NEXT: s_mov_b32 s10, s2 -; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s11 +; SI-NOFMA-NEXT: s_mov_b32 s2, s10 +; SI-NOFMA-NEXT: s_mov_b32 s3, s11 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-NOFMA-NEXT: s_mov_b32 s0, s4 -; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_mov_b32 s8, s4 +; SI-NOFMA-NEXT: s_mov_b32 s9, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_add_x_negone: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s2, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s10, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s10 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s3 -; SI-FMA-NEXT: s_mov_b32 s10, s2 -; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s11 +; SI-FMA-NEXT: s_mov_b32 s2, s10 +; SI-FMA-NEXT: s_mov_b32 s3, s11 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-FMA-NEXT: s_mov_b32 s0, s4 -; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-FMA-NEXT: 
s_mov_b32 s8, s4 +; SI-FMA-NEXT: s_mov_b32 s9, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_add_x_negone: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1245,8 +1273,8 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_add_x_negone: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1271,55 +1299,55 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_sub_one_x_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s2, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s10, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s10 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; 
SI-NOFMA-NEXT: s_mov_b32 s15, s3 -; SI-NOFMA-NEXT: s_mov_b32 s10, s2 -; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s11 +; SI-NOFMA-NEXT: s_mov_b32 s2, s10 +; SI-NOFMA-NEXT: s_mov_b32 s3, s11 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-NOFMA-NEXT: s_mov_b32 s0, s4 -; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_mov_b32 s8, s4 +; SI-NOFMA-NEXT: s_mov_b32 s9, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_sub_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_sub_one_x_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s2, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s10, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s10 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s3 -; SI-FMA-NEXT: s_mov_b32 s10, s2 -; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s11 +; SI-FMA-NEXT: s_mov_b32 s2, s10 +; SI-FMA-NEXT: s_mov_b32 s3, s11 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-FMA-NEXT: s_mov_b32 s0, s4 -; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-FMA-NEXT: s_mov_b32 s8, s4 +; SI-FMA-NEXT: s_mov_b32 s9, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, v1 -; SI-FMA-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_sub_one_x_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1338,8 +1366,8 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_sub_one_x_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1364,55 +1392,55 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_sub_one_x: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s2, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s10, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s10 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s3 -; SI-NOFMA-NEXT: s_mov_b32 s10, s2 -; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s11 +; 
SI-NOFMA-NEXT: s_mov_b32 s2, s10 +; SI-NOFMA-NEXT: s_mov_b32 s3, s11 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-NOFMA-NEXT: s_mov_b32 s0, s4 -; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_mov_b32 s8, s4 +; SI-NOFMA-NEXT: s_mov_b32 s9, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_sub_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_sub_one_x: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s2, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s10, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s10 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s3 -; SI-FMA-NEXT: s_mov_b32 s10, s2 -; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s11 +; SI-FMA-NEXT: s_mov_b32 s2, s10 +; SI-FMA-NEXT: s_mov_b32 s3, s11 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-FMA-NEXT: s_mov_b32 s0, s4 -; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-FMA-NEXT: s_mov_b32 s8, s4 +; SI-FMA-NEXT: s_mov_b32 s9, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: 
test_f32_mul_y_sub_one_x: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1431,8 +1459,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_sub_one_x: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1457,55 +1485,55 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_sub_negone_x_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s2, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s10, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s10 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s3 -; SI-NOFMA-NEXT: s_mov_b32 s10, s2 -; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s11 +; SI-NOFMA-NEXT: s_mov_b32 s2, s10 +; SI-NOFMA-NEXT: s_mov_b32 s3, s11 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: 
buffer_load_dword v1, off, s[8:11], 0 -; SI-NOFMA-NEXT: s_mov_b32 s0, s4 -; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_mov_b32 s8, s4 +; SI-NOFMA-NEXT: s_mov_b32 s9, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_sub_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_sub_negone_x_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s2, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s10, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s10 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s3 -; SI-FMA-NEXT: s_mov_b32 s10, s2 -; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s11 +; SI-FMA-NEXT: s_mov_b32 s2, s10 +; SI-FMA-NEXT: s_mov_b32 s3, s11 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-FMA-NEXT: s_mov_b32 s0, s4 -; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-FMA-NEXT: s_mov_b32 s8, s4 +; SI-FMA-NEXT: s_mov_b32 s9, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_sub_negone_x_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; 
GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1524,8 +1552,8 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_sub_negone_x_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1550,55 +1578,55 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_sub_negone_x: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s2, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s10, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s10 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s3 -; SI-NOFMA-NEXT: s_mov_b32 s10, s2 -; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s11 +; SI-NOFMA-NEXT: s_mov_b32 s2, s10 +; SI-NOFMA-NEXT: s_mov_b32 s3, s11 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-NOFMA-NEXT: s_mov_b32 s0, s4 -; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: 
buffer_load_dword v1, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_mov_b32 s8, s4 +; SI-NOFMA-NEXT: s_mov_b32 s9, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_sub_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_sub_negone_x: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s2, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s10, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s10 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s3 -; SI-FMA-NEXT: s_mov_b32 s10, s2 -; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s11 +; SI-FMA-NEXT: s_mov_b32 s2, s10 +; SI-FMA-NEXT: s_mov_b32 s3, s11 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-FMA-NEXT: s_mov_b32 s0, s4 -; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-FMA-NEXT: s_mov_b32 s8, s4 +; SI-FMA-NEXT: s_mov_b32 s9, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_negone_x: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 
s[0:1], s[2:3], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1617,8 +1645,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_sub_negone_x: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1643,55 +1671,55 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_sub_x_one_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s2, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s10, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s10 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s3 -; SI-NOFMA-NEXT: s_mov_b32 s10, s2 -; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s11 +; SI-NOFMA-NEXT: s_mov_b32 s2, s10 +; SI-NOFMA-NEXT: s_mov_b32 s3, s11 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-NOFMA-NEXT: s_mov_b32 s0, s4 -; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_mov_b32 s8, s4 +; SI-NOFMA-NEXT: s_mov_b32 s9, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; 
SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_sub_x_one_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s2, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s10, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s10 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s3 -; SI-FMA-NEXT: s_mov_b32 s10, s2 -; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s11 +; SI-FMA-NEXT: s_mov_b32 s2, s10 +; SI-FMA-NEXT: s_mov_b32 s3, s11 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-FMA-NEXT: s_mov_b32 s0, s4 -; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-FMA-NEXT: s_mov_b32 s8, s4 +; SI-FMA-NEXT: s_mov_b32 s9, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_sub_x_one_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1710,8 +1738,8 
@@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_sub_x_one_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1736,55 +1764,55 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_one: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s2, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s10, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s10 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s3 -; SI-NOFMA-NEXT: s_mov_b32 s10, s2 -; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s11 +; SI-NOFMA-NEXT: s_mov_b32 s2, s10 +; SI-NOFMA-NEXT: s_mov_b32 s3, s11 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-NOFMA-NEXT: s_mov_b32 s0, s4 -; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_mov_b32 s8, s4 +; SI-NOFMA-NEXT: s_mov_b32 s9, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, 
s[0:3], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_sub_x_one: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s2, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s10, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s10 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s3 -; SI-FMA-NEXT: s_mov_b32 s10, s2 -; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s11 +; SI-FMA-NEXT: s_mov_b32 s2, s10 +; SI-FMA-NEXT: s_mov_b32 s3, s11 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-FMA-NEXT: s_mov_b32 s0, s4 -; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-FMA-NEXT: s_mov_b32 s8, s4 +; SI-FMA-NEXT: s_mov_b32 s9, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_x_one: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1803,8 +1831,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_one: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: 
s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1829,55 +1857,55 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_sub_x_negone_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s2, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s10, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s10 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s3 -; SI-NOFMA-NEXT: s_mov_b32 s10, s2 -; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s11 +; SI-NOFMA-NEXT: s_mov_b32 s2, s10 +; SI-NOFMA-NEXT: s_mov_b32 s3, s11 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-NOFMA-NEXT: s_mov_b32 s0, s4 -; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_mov_b32 s8, s4 +; SI-NOFMA-NEXT: s_mov_b32 s9, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_sub_x_negone_y: ; SI-FMA: ; 
%bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s2, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s10, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s10 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s3 -; SI-FMA-NEXT: s_mov_b32 s10, s2 -; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s11 +; SI-FMA-NEXT: s_mov_b32 s2, s10 +; SI-FMA-NEXT: s_mov_b32 s3, s11 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-FMA-NEXT: s_mov_b32 s0, s4 -; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-FMA-NEXT: s_mov_b32 s8, s4 +; SI-FMA-NEXT: s_mov_b32 s9, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_sub_x_negone_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1896,8 +1924,8 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_sub_x_negone_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: 
s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1922,55 +1950,55 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_negone: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s2, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s10, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s10 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s3 -; SI-NOFMA-NEXT: s_mov_b32 s10, s2 -; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s11 +; SI-NOFMA-NEXT: s_mov_b32 s2, s10 +; SI-NOFMA-NEXT: s_mov_b32 s3, s11 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-NOFMA-NEXT: s_mov_b32 s0, s4 -; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_mov_b32 s8, s4 +; SI-NOFMA-NEXT: s_mov_b32 s9, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_sub_x_negone: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-FMA-NEXT: s_mov_b32 
s3, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s2, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s10, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s10 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s3 -; SI-FMA-NEXT: s_mov_b32 s10, s2 -; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s11 +; SI-FMA-NEXT: s_mov_b32 s2, s10 +; SI-FMA-NEXT: s_mov_b32 s3, s11 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-FMA-NEXT: s_mov_b32 s0, s4 -; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-FMA-NEXT: s_mov_b32 s8, s4 +; SI-FMA-NEXT: s_mov_b32 s9, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_x_negone: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1989,8 +2017,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_negone: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -2019,7 +2047,7 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_interp: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 ; SI-NOFMA-NEXT: s_mov_b32 s10, -1 ; SI-NOFMA-NEXT: s_mov_b32 s14, s10 @@ -2051,7 +2079,7 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ; ; SI-FMA-LABEL: test_f32_interp: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 ; SI-FMA-NEXT: s_mov_b32 s10, -1 ; SI-FMA-NEXT: s_mov_b32 s18, s10 @@ -2081,7 +2109,7 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ; ; GFX11-NOFMA-LABEL: test_f32_interp: ; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x2 @@ -2102,7 +2130,7 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ; ; GFX11-FMA-LABEL: test_f32_interp: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x2 @@ -2135,7 +2163,7 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, ; SI-FMA-LABEL: test_f64_interp: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 ; SI-FMA-NEXT: s_mov_b32 s10, -1 ; SI-FMA-NEXT: s_mov_b32 s18, s10 @@ -2165,7 
+2193,7 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, ; ; GFX11-NOFMA-LABEL: test_f64_interp: ; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x2 @@ -2186,7 +2214,7 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, ; ; GFX11-FMA-LABEL: test_f64_interp: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-FMA-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x2 @@ -2220,7 +2248,7 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: fma_neg_2.0_neg_a_b_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2236,7 +2264,9 @@ define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: fma_neg_2.0_neg_a_b_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc @@ -2266,7 +2296,7 @@ define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: fma_2.0_neg_a_b_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 
0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2282,7 +2312,9 @@ define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: fma_2.0_neg_a_b_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc @@ -2312,7 +2344,7 @@ define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @fma_neg_b_c_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #2 { ; SI-LABEL: fma_neg_b_c_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v12, 4, v0 @@ -2333,7 +2365,9 @@ define amdgpu_kernel void @fma_neg_b_c_v4f32(ptr addrspace(1) %out, ptr addrspac ; ; GFX11-LABEL: fma_neg_b_c_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/fma.ll b/llvm/test/CodeGen/AMDGPU/fma.ll index 93ed64d93b8ba..39a9a85081af5 100644 --- a/llvm/test/CodeGen/AMDGPU/fma.ll +++ b/llvm/test/CodeGen/AMDGPU/fma.ll @@ -159,15 +159,15 @@ define float @fold_fmul_distributive(float %x, float %y) { define amdgpu_kernel void @vec_mul_scalar_add_fma(<2 x float> %a, <2 x float> %b, float %c1, ptr addrspace(1) %inptr) { ; GFX906-LABEL: vec_mul_scalar_add_fma: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dword s8, s[0:1], 0x34 -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; 
GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX906-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v1, s8 ; GFX906-NEXT: v_mov_b32_e32 v2, s6 ; GFX906-NEXT: v_fmac_f32_e32 v1, s4, v2 -; GFX906-NEXT: global_store_dword v0, v1, s[2:3] offset:4 +; GFX906-NEXT: global_store_dword v0, v1, s[0:1] offset:4 ; GFX906-NEXT: s_endpgm %gep = getelementptr float, ptr addrspace(1) %inptr, i32 1 %c = shufflevector <2 x float> %a, <2 x float> poison, <2 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll index 23eb73038917d..84852c2632f67 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmax3_olt_0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -37,7 +37,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmax3_olt_0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -67,7 +67,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmax3_olt_0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -97,7 +97,7 @@ define amdgpu_kernel 
void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmax3_olt_0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -139,7 +139,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmax3_olt_1_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -169,7 +169,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmax3_olt_1_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -199,7 +199,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmax3_olt_1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -229,7 +229,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmax3_olt_1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -270,7 +270,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void 
@test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmax3_olt_0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -304,7 +304,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmax3_olt_0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -338,7 +338,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmax3_olt_0_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -368,7 +368,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmax3_olt_0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -410,7 +410,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmax3_olt_1_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -444,7 +444,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; 
VI-LABEL: test_fmax3_olt_1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -478,7 +478,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmax3_olt_1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -508,7 +508,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmax3_olt_1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll index 01b2f207388e8..018399983a863 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmax_legacy_uge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -28,7 +28,7 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmax_legacy_uge_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -59,7 +59,7 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f64(ptr addrspace(1) %out, ptr a 
define amdgpu_kernel void @test_fmax_legacy_oge_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmax_legacy_oge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -80,7 +80,7 @@ define amdgpu_kernel void @test_fmax_legacy_oge_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmax_legacy_oge_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -111,7 +111,7 @@ define amdgpu_kernel void @test_fmax_legacy_oge_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmax_legacy_ugt_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmax_legacy_ugt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -132,7 +132,7 @@ define amdgpu_kernel void @test_fmax_legacy_ugt_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmax_legacy_ugt_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -163,7 +163,7 @@ define amdgpu_kernel void @test_fmax_legacy_ugt_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmax_legacy_ogt_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmax_legacy_ogt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -184,7 +184,7 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_f64(ptr 
addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmax_legacy_ogt_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll index 87ac95a1cd739..3b7009023b03a 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll @@ -262,8 +262,8 @@ define amdgpu_kernel void @fmaximumi_f32_move_to_valu(ptr addrspace(1) %out, ptr ; GCN-LABEL: fmaximumi_f32_move_to_valu: ; GCN: ; %bb.0: ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GCN-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_load_b32 v1, v0, s[6:7] scope:SCOPE_SYS @@ -286,8 +286,8 @@ define amdgpu_kernel void @fmaximum_f16_move_to_valu(ptr addrspace(1) %out, ptr ; GCN-LABEL: fmaximum_f16_move_to_valu: ; GCN: ; %bb.0: ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GCN-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_load_u16 v1, v0, s[6:7] scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index 764fb992d4d34..84099e472d65f 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -11,7 +11,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: 
s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -28,7 +28,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -45,7 +45,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -63,7 +63,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -83,7 +83,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -95,13 +95,14 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -122,7 +123,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -139,7 +140,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt ; ; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -156,7 +157,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt ; ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -174,7 +175,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt ; ; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; 
VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -194,7 +195,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt ; ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -206,13 +207,14 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -234,7 +236,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -251,7 +253,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; 
SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -268,7 +270,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -286,7 +288,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -306,7 +308,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) ; ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -318,13 +320,14 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) ; ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -346,7 +349,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -363,7 +366,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -380,7 +383,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -398,7 +401,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -418,7 +421,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) ; ; GFX9-LABEL: 
v_test_fmed3_nnan_r_i_i_commute1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -430,13 +433,14 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) ; ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -458,7 +462,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -476,7 +480,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp ; ; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -494,7 +498,7 @@ define amdgpu_kernel 
void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp ; ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -513,7 +517,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp ; ; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -534,7 +538,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp ; ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -547,13 +551,14 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp ; ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_maxmin_f32 v1, v1, 4.0, 2.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -575,7 +580,7 @@ define amdgpu_kernel void 
@v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -596,7 +601,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; ; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -617,7 +622,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -639,7 +644,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; ; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -663,7 +668,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; ; GFX9-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-SDAG-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -679,7 +684,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; ; GFX9-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -695,14 +700,16 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; ; GFX11-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_max_f32_e32 v1, 2.0, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_min_f32_e32 v2, 4.0, v1 ; GFX11-SDAG-NEXT: global_store_b32 v0, v2, s[0:1] dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 @@ -714,13 +721,14 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; ; GFX11-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; 
GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_med3_f32 v2, v1, 2.0, 4.0 ; GFX11-GISEL-NEXT: v_max_f32_e32 v1, 2.0, v1 ; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[0:1] dlc @@ -747,7 +755,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_fmed3_r_i_i_f64: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -765,7 +773,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add ; ; SI-GISEL-LABEL: v_test_fmed3_r_i_i_f64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -783,7 +791,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add ; ; VI-SDAG-LABEL: v_test_fmed3_r_i_i_f64: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -802,7 +810,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add ; ; VI-GISEL-LABEL: v_test_fmed3_r_i_i_f64: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, 
v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -823,7 +831,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_test_fmed3_r_i_i_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -836,14 +844,16 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: v_test_fmed3_r_i_i_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], 2.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 @@ -865,7 +875,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; SI-SDAG-LABEL: v_test_fmed3_r_i_i_no_nans_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -881,7 +891,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_fmed3_r_i_i_no_nans_f32: 
; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -897,7 +907,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_fmed3_r_i_i_no_nans_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -914,7 +924,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_fmed3_r_i_i_no_nans_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -933,7 +943,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_fmed3_r_i_i_no_nans_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -944,7 +954,9 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_fmed3_r_i_i_no_nans_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -969,7 +981,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) 
%out, define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -986,7 +998,7 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1004,7 +1016,7 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1022,7 +1034,7 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1045,7 +1057,7 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; ; GFX9-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword 
v1, v0, s[2:3] @@ -1057,7 +1069,7 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; ; GFX9-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -1072,13 +1084,14 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; ; GFX11-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 @@ -1087,15 +1100,17 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; ; GFX11-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 2.0, v1 ; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 2.0, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 4.0, v1 ; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc_lo ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1123,7 +1138,7 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1147,7 +1162,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -1170,7 +1185,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1197,7 +1212,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: 
s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1229,7 +1244,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -1244,7 +1259,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -1260,7 +1275,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1277,7 +1294,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1313,7 +1332,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1337,7 +1356,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -1360,7 +1379,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1387,7 +1406,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1419,7 +1438,7 @@ define amdgpu_kernel void 
@v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -1434,7 +1453,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -1450,7 +1469,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1467,7 +1488,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1503,7 +1526,7 @@ define amdgpu_kernel void 
@v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1527,7 +1550,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -1550,7 +1573,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1577,7 +1600,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1609,7 +1632,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 
0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -1624,7 +1647,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -1640,7 +1663,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1657,7 +1682,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1693,7 +1720,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr 
addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1717,7 +1744,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -1741,7 +1768,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1768,7 +1795,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1801,7 +1828,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, 
s[2:3] glc @@ -1816,7 +1843,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -1833,7 +1860,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1850,7 +1879,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1893,7 +1924,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 
s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1917,7 +1948,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -1942,7 +1973,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1969,7 +2000,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2003,7 +2034,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -2018,7 +2049,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; GFX9-GISEL: 
; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -2036,7 +2067,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -2053,7 +2086,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -2099,7 +2134,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_inputs_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2126,7 +2161,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) 
%out, pt ; ; SI-GISEL-LABEL: v_nnan_inputs_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -2151,7 +2186,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt ; ; VI-SDAG-LABEL: v_nnan_inputs_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2181,7 +2216,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt ; ; VI-GISEL-LABEL: v_nnan_inputs_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2215,7 +2250,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt ; ; GFX9-LABEL: v_nnan_inputs_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -2233,7 +2268,9 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_nnan_inputs_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -2274,7 +2311,7 @@ define 
amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_input_calls_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2298,7 +2335,7 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou ; ; SI-GISEL-LABEL: v_nnan_input_calls_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -2320,7 +2357,7 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou ; ; VI-SDAG-LABEL: v_nnan_input_calls_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2347,7 +2384,7 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou ; ; VI-GISEL-LABEL: v_nnan_input_calls_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2378,7 +2415,7 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_nnan_input_calls_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 
0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -2393,7 +2430,9 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_nnan_input_calls_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -2426,7 +2465,7 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_call_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2450,7 +2489,7 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_nnan_call_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -2472,7 +2511,7 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_nnan_call_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2499,7 +2538,7 @@ define amdgpu_kernel void 
@v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_nnan_call_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2530,7 +2569,7 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_nnan_call_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -2545,7 +2584,9 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_nnan_call_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -2578,7 +2619,7 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_fast_call_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2602,7 +2643,7 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_fast_call_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: 
s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -2624,7 +2665,7 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_fast_call_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2651,7 +2692,7 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_fast_call_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2682,7 +2723,7 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_fast_call_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -2697,7 +2738,9 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_fast_call_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -2742,7 +2785,7 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %out, ptr 
addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2766,7 +2809,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -2788,7 +2831,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2815,7 +2858,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2846,7 +2889,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -2861,7 +2904,9 @@ define 
amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -2894,7 +2939,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2918,7 +2963,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -2940,7 +2985,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2967,7 +3012,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1: ; VI-GISEL: ; %bb.0: -; 
VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2998,7 +3043,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -3013,7 +3058,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -3046,7 +3093,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3070,7 +3117,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -3093,7 +3140,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3120,7 +3167,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3152,7 +3199,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -3167,7 +3214,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -3183,7 +3230,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; GFX11-SDAG: ; %bb.0: 
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -3200,7 +3249,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -3236,7 +3287,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat2: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3260,7 +3311,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat2: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -3282,7 +3333,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: 
v_test_global_nnans_med3_f32_pat2: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3309,7 +3360,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat2: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3340,7 +3391,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -3355,7 +3406,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -3388,7 +3441,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat3: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], 
s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3412,7 +3465,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat3: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -3434,7 +3487,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat3: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3461,7 +3514,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat3: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3492,7 +3545,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -3507,7 +3560,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat3: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 
0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -3540,7 +3595,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat4: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3564,7 +3619,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat4: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -3586,7 +3641,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat4: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3613,7 +3668,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat4: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ 
-3644,7 +3699,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -3659,7 +3714,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat4: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -3692,7 +3749,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat5: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3716,7 +3773,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat5: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -3738,7 +3795,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat5: ; 
VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3765,7 +3822,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat5: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3796,7 +3853,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat5: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -3811,7 +3868,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat5: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -3844,7 +3903,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat6: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: 
s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3868,7 +3927,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat6: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -3890,7 +3949,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat6: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3917,7 +3976,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat6: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3948,7 +4007,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat6: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -3963,7 +4022,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat6: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: 
v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -3996,7 +4057,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat7: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4020,7 +4081,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat7: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -4042,7 +4103,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat7: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4069,7 +4130,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat7: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4100,7 +4161,7 @@ define 
amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat7: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -4115,7 +4176,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat7: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -4148,7 +4211,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat8: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4172,7 +4235,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat8: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -4194,7 +4257,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat8: ; VI-SDAG: ; %bb.0: -; 
VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4221,7 +4284,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat8: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4252,7 +4315,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -4267,7 +4330,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -4300,7 +4365,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat9: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; 
SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4324,7 +4389,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat9: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -4346,7 +4411,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat9: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4373,7 +4438,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat9: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4404,7 +4469,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat9: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -4419,7 +4484,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat9: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -4452,7 +4519,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat10: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4476,7 +4543,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat10: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -4498,7 +4565,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat10: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4525,7 +4592,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat10: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4556,7 +4623,7 @@ define amdgpu_kernel void 
@v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat10: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -4571,7 +4638,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat10: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -4604,7 +4673,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat11: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4628,7 +4697,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat11: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -4650,7 +4719,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat11: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: 
s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4677,7 +4746,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat11: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4708,7 +4777,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat11: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -4723,7 +4792,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat11: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -4756,7 +4827,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat12: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: 
s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4780,7 +4851,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat12: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -4802,7 +4873,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat12: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4829,7 +4900,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat12: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4860,7 +4931,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat12: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -4875,7 +4946,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat12: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -4908,7 +4981,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat13: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4932,7 +5005,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat13: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -4954,7 +5027,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat13: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4981,7 +5054,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat13: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -5012,7 +5085,7 @@ define amdgpu_kernel void 
@v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat13: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -5027,7 +5100,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat13: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -5060,7 +5135,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat14: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5084,7 +5159,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat14: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -5106,7 +5181,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat14: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: 
s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -5133,7 +5208,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat14: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -5164,7 +5239,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat14: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -5179,7 +5254,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat14: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -5212,7 +5289,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat15: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: 
s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5236,7 +5313,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat15: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -5258,7 +5335,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat15: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -5285,7 +5362,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat15: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -5316,7 +5393,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -5331,7 +5408,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat15: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -5367,7 +5446,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat16: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5391,7 +5470,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat16: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -5413,7 +5492,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat16: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -5440,7 +5519,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -5471,7 +5550,7 @@ define amdgpu_kernel void 
@v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -5486,7 +5565,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -5523,7 +5604,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5556,7 +5637,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -5588,7 +5669,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; VI-SDAG: ; %bb.0: -; 
VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -5623,7 +5704,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -5662,7 +5743,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ; ; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -5685,7 +5766,9 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ; ; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -5725,7 +5808,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s10, 
0 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5758,7 +5841,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -5790,7 +5873,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -5825,7 +5908,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -5864,7 +5947,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; ; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -5887,7 +5970,9 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; ; GFX11-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; 
GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -5896,9 +5981,10 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX11-SDAG-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v4, v1, v2 +; GFX11-SDAG-NEXT: v_max_f32_e32 v4, v1, v2 ; GFX11-SDAG-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_minmax_f32 v1, v1, v2, v3 @@ -5911,7 +5997,9 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; ; GFX11-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -5951,7 +6039,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 
s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5984,7 +6072,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -6016,7 +6104,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -6051,7 +6139,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -6090,7 +6178,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ; ; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -6113,7 +6201,9 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ; ; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: 
s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -6153,7 +6243,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -6182,7 +6272,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_test_safe_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -6210,7 +6300,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -6243,7 +6333,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -6280,7 +6370,7 @@ 
define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_safe_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -6301,7 +6391,9 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX11-SDAG-LABEL: v_test_safe_med3_f32_pat0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -6322,7 +6414,9 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX11-GISEL-LABEL: v_test_safe_med3_f32_pat0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -6359,7 +6453,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -6386,7 
+6480,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -6411,7 +6505,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -6441,7 +6535,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -6475,7 +6569,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) ; ; GFX9-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -6493,7 +6587,9 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) ; ; GFX11-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -6534,7 +6630,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -6561,7 +6657,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -6586,7 +6682,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -6616,7 +6712,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -6650,7 +6746,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) 
; ; GFX9-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -6668,7 +6764,9 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) ; ; GFX11-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -6709,7 +6807,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -6736,7 +6834,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -6761,7 +6859,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; 
VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -6791,7 +6889,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -6825,7 +6923,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) ; ; GFX9-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -6843,7 +6941,9 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) ; ; GFX11-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -6884,7 +6984,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; 
SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -6908,7 +7008,7 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; ; SI-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -6931,7 +7031,7 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; ; VI-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -6958,7 +7058,7 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; ; VI-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -6990,7 +7090,7 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX9-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -7005,7 +7105,7 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX9-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 
s[0:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -7021,7 +7121,9 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX11-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -7038,7 +7140,9 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX11-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -7074,7 +7178,7 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -7100,7 +7204,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; ; SI-GISEL-LABEL: 
v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -7126,7 +7230,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -7156,7 +7260,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -7191,7 +7295,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -7209,7 +7313,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -7228,7 +7332,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -7247,7 +7353,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -7286,7 +7394,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_min_max_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -7311,7 +7419,7 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out ; ; SI-GISEL-LABEL: v_test_global_nnans_min_max_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 
; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -7334,7 +7442,7 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out ; ; VI-SDAG-LABEL: v_test_global_nnans_min_max_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -7362,7 +7470,7 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out ; ; VI-GISEL-LABEL: v_test_global_nnans_min_max_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -7394,7 +7502,7 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out ; ; GFX9-LABEL: v_test_global_nnans_min_max_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -7410,7 +7518,9 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out ; ; GFX11-LABEL: v_test_global_nnans_min_max_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -7441,7 +7551,7 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %out, ptr 
addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -7460,7 +7570,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -7487,7 +7597,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -7506,7 +7616,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -7527,7 +7637,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -7539,13 +7649,14 @@ define amdgpu_kernel void 
@v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f16 v1, v1, 2.0, 4.0 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -7566,7 +7677,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_inputs_med3_f16_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -7597,7 +7708,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; ; SI-GISEL-LABEL: v_nnan_inputs_med3_f16_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -7644,7 +7755,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; ; VI-SDAG-LABEL: v_nnan_inputs_med3_f16_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) 
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -7677,7 +7788,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; ; VI-GISEL-LABEL: v_nnan_inputs_med3_f16_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -7714,7 +7825,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; ; GFX9-LABEL: v_nnan_inputs_med3_f16_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -7732,7 +7843,9 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_nnan_inputs_med3_f16_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -7774,7 +7887,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: two_non_inline_constant: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -7792,7 +7905,7 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; SI-GISEL-LABEL: two_non_inline_constant: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -7810,7 +7923,7 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; VI-SDAG-LABEL: two_non_inline_constant: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -7829,7 +7942,7 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; VI-GISEL-LABEL: two_non_inline_constant: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -7850,7 +7963,7 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: two_non_inline_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -7863,7 +7976,9 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; GFX11-SDAG-LABEL: two_non_inline_constant: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] @@ -7879,14 +7994,15 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; 
GFX11-GISEL-LABEL: two_non_inline_constant: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x41800000 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 0.5, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_maxmin_f32 v1, v1, 0x41000000, v2 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 @@ -7908,7 +8024,7 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: one_non_inline_constant: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -7930,7 +8046,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; SI-GISEL-LABEL: one_non_inline_constant: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -7952,7 +8068,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; VI-SDAG-LABEL: one_non_inline_constant: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: 
v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -7974,7 +8090,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; VI-GISEL-LABEL: one_non_inline_constant: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x41800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -7998,7 +8114,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: one_non_inline_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x41800000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -8014,7 +8130,9 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: one_non_inline_constant: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -8047,7 +8165,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: two_non_inline_constant_multi_use: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -8073,7 +8191,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: 
two_non_inline_constant_multi_use: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -8099,7 +8217,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: two_non_inline_constant_multi_use: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -8125,7 +8243,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: two_non_inline_constant_multi_use: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x41000000 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x41800000 @@ -8153,7 +8271,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; GFX9-SDAG-LABEL: two_non_inline_constant_multi_use: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x41800000 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -8173,7 +8291,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; GFX9-GISEL-LABEL: two_non_inline_constant_multi_use: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x41000000 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x41800000 @@ -8193,7 +8311,9 @@ define 
amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; GFX11-SDAG-LABEL: two_non_inline_constant_multi_use: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] @@ -8215,13 +8335,15 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; GFX11-GISEL-LABEL: two_non_inline_constant_multi_use: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x41800000 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0x41800000 :: v_dual_add_f32 v3, 0.5, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_f32_e32 v3, 0.5, v1 ; GFX11-GISEL-NEXT: v_med3_f32 v2, v3, 0x41000000, v2 ; GFX11-GISEL-NEXT: v_add_f32_e32 v3, 0x41800000, v1 ; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 0x41000000, v1 diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll index 7337d90b4bea6..3a55b2d50a5e5 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 
s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -37,7 +37,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmin3_olt_0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -67,7 +67,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -97,7 +97,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmin3_olt_0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -139,7 +139,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_1_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -169,7 +169,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmin3_olt_1_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -199,7 +199,7 
@@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -229,7 +229,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmin3_olt_1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -270,7 +270,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -304,7 +304,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmin3_olt_0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -338,7 +338,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_0_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -368,7 +368,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: 
test_fmin3_olt_0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -410,7 +410,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_1_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -444,7 +444,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmin3_olt_1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -478,7 +478,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -508,7 +508,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmin3_olt_1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -612,7 +612,7 @@ entry: define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_0_f64: ; SI: ; 
%bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -646,7 +646,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmin3_olt_0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -680,7 +680,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -714,7 +714,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmin3_olt_0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -759,7 +759,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_1_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -793,7 +793,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmin3_olt_1_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 
s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -827,7 +827,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -861,7 +861,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmin3_olt_1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll index d20c39d510364..85653ded63ce6 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @test_fmin_legacy_uge_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_uge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -26,7 +26,7 @@ define amdgpu_kernel void @test_fmin_legacy_uge_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_uge_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -57,7 +57,7 @@ define amdgpu_kernel void @test_fmin_legacy_uge_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_ugt_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_ugt_f64: ; SI: ; %bb.0: -; 
SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -78,7 +78,7 @@ define amdgpu_kernel void @test_fmin_legacy_ugt_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_ugt_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -109,7 +109,7 @@ define amdgpu_kernel void @test_fmin_legacy_ugt_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_ule_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_ule_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -130,7 +130,7 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_ule_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -161,7 +161,7 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_ult_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_ult_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -182,7 +182,7 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_ult_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -213,7 +213,7 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_oge_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_oge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -234,7 +234,7 @@ define amdgpu_kernel void @test_fmin_legacy_oge_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_oge_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -265,7 +265,7 @@ define amdgpu_kernel void @test_fmin_legacy_oge_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_ogt_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_ogt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -286,7 +286,7 @@ define amdgpu_kernel void @test_fmin_legacy_ogt_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_ogt_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -317,7 +317,7 @@ define amdgpu_kernel void @test_fmin_legacy_ogt_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_ole_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_ole_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -338,7 +338,7 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_ole_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -369,7 +369,7 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_olt_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_olt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -390,7 +390,7 @@ define amdgpu_kernel void @test_fmin_legacy_olt_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_olt_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll index 45f6bff10f45e..817e6dd87361f 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll @@ -262,8 +262,8 @@ define amdgpu_kernel void @fminimumi_f32_move_to_valu(ptr addrspace(1) %out, ptr ; GCN-LABEL: fminimumi_f32_move_to_valu: ; GCN: ; %bb.0: ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GCN-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_load_b32 v1, v0, s[6:7] 
scope:SCOPE_SYS @@ -286,8 +286,8 @@ define amdgpu_kernel void @fminimum_f16_move_to_valu(ptr addrspace(1) %out, ptr ; GCN-LABEL: fminimum_f16_move_to_valu: ; GCN: ; %bb.0: ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GCN-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_load_u16 v1, v0, s[6:7] scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll index 7830c91851bfa..c60b9858abd83 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -15,7 +15,7 @@ declare float @llvm.fabs.f32(float) #1 define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, float %x, float %y, float %z) #0 { ; VI-LABEL: multiple_fadd_use_test_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f32_e64 v0, s3, -1.0 ; VI-NEXT: v_add_f32_e64 v1, s2, -1.0 @@ -31,7 +31,7 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo ; ; GFX10-LABEL: multiple_fadd_use_test_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e64 v0, s3, -1.0 @@ -46,7 +46,7 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo ; ; GFX11-LABEL: multiple_fadd_use_test_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f32_e64 v0, s3, -1.0 @@ -79,20 +79,20 @@ define amdgpu_kernel void 
@multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, float %x, [8 x i32], float %y) #0 { ; VI-LABEL: multiple_use_fadd_fmac_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s6, s[4:5], 0x8 -; VI-NEXT: s_load_dword s3, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x8 +; VI-NEXT: s_load_dword s3, s[6:7], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s2, s0, 4 -; VI-NEXT: v_add_f32_e64 v2, s6, s6 +; VI-NEXT: v_add_f32_e64 v2, s4, s4 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mac_f32_e64 v3, s6, 2.0 +; VI-NEXT: v_mac_f32_e64 v3, s4, 2.0 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -101,9 +101,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo ; GFX10-LABEL: multiple_use_fadd_fmac_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-NEXT: s_load_dword s3, s[4:5], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s3, s[6:7], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e64 v1, s2, s2 @@ -117,13 +117,13 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo ; GFX11-LABEL: multiple_use_fadd_fmac_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s5, 
s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f32_e64 v1, s2, s2 -; GFX11-NEXT: v_fma_f32 v2, s2, 2.0, s3 +; GFX11-NEXT: v_add_f32_e64 v1, s4, s4 +; GFX11-NEXT: v_fma_f32 v2, s4, 2.0, s5 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] offset:4 dlc @@ -142,7 +142,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, float %x, float %y) #0 { ; VI-LABEL: multiple_use_fadd_fmad_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s4, s0, 4 @@ -161,7 +161,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo ; ; GFX10-LABEL: multiple_use_fadd_fmad_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e64 v1, |s2|, |s2| @@ -174,7 +174,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo ; ; GFX11-LABEL: multiple_use_fadd_fmad_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f32_e64 v1, |s2|, |s2| @@ -198,21 +198,21 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %out, float %x, float %y, float %z) #0 { ; VI-LABEL: multiple_use_fadd_multi_fmad_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s6, 4 +; VI-NEXT: s_add_u32 s6, s4, 4 ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mad_f32 v2, |s0|, 2.0, v0 ; VI-NEXT: v_mad_f32 v3, |s0|, 2.0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: s_addc_u32 s5, s7, 0 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_addc_u32 s7, s5, 0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm @@ -220,23 +220,23 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou ; GFX10-LABEL: multiple_use_fadd_multi_fmad_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_fma_f32 v1, |s0|, 2.0, s1 ; GFX10-NEXT: v_fma_f32 v2, |s0|, 2.0, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[6:7] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_store_dword v0, v2, s[6:7] offset:4 +; GFX10-NEXT: global_store_dword v0, v2, s[4:5] offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: multiple_use_fadd_multi_fmad_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_fma_f32 v1, |s4|, 2.0, s5 @@ -261,8 +261,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, float %y) #0 { ; VI-LABEL: fmul_x2_xn2_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f32_e64 v0, s2, -4.0 ; VI-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -275,8 +275,8 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa ; GFX10-LABEL: fmul_x2_xn2_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mul_f32_e64 v0, s2, -4.0 @@ -288,12 +288,12 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa ; GFX11-LABEL: fmul_x2_xn2_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f32_e64 v0, s2, -4.0 +; GFX11-NEXT: v_mul_f32_e64 v0, s4, -4.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s4, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 @@ -310,8 +310,8 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa define amdgpu_kernel void 
@fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, float %y) #0 { ; VI-LABEL: fmul_x2_xn3_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0xc0c00000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f32_e32 v0, s2, v0 @@ -325,8 +325,8 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa ; GFX10-LABEL: fmul_x2_xn3_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mul_f32_e64 v0, 0xc0c00000, s2 @@ -338,12 +338,12 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa ; GFX11-LABEL: fmul_x2_xn3_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f32_e64 v0, 0xc0c00000, s2 +; GFX11-NEXT: v_mul_f32_e64 v0, 0xc0c00000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s4, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 @@ -360,8 +360,8 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 { ; VI-DENORM-LABEL: multiple_fadd_use_test_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dword s2, 
s[4:5], 0x8 -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-DENORM-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s3, s2, 16 ; VI-DENORM-NEXT: v_add_f16_e64 v0, s2, -1.0 @@ -378,8 +378,8 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; VI-FLUSH-LABEL: multiple_fadd_use_test_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-FLUSH-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s3, s2, 16 ; VI-FLUSH-NEXT: v_add_f16_e64 v0, s2, -1.0 @@ -396,13 +396,13 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; GFX10-DENORM-LABEL: multiple_fadd_use_test_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX10-DENORM-NEXT: s_load_dword s0, s[6:7], 0x8 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_lshr_b32 s1, s0, 16 ; GFX10-DENORM-NEXT: v_add_f16_e64 v0, s0, -1.0 ; GFX10-DENORM-NEXT: v_add_f16_e64 v1, s1, -1.0 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-DENORM-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| ; GFX10-DENORM-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-DENORM-NEXT: v_add_f16_e64 v0, |v0|, |v0| @@ -414,12 +414,12 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; GFX10-FLUSH-LABEL: multiple_fadd_use_test_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX10-FLUSH-NEXT: s_load_dword s0, s[6:7], 0x8 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_lshr_b32 s1, s0, 16 ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, s0, -1.0 ; GFX10-FLUSH-NEXT: 
v_add_f16_e64 v1, s1, -1.0 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-FLUSH-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| ; GFX10-FLUSH-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |v0|, |v0| @@ -433,14 +433,13 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; GFX11-DENORM-LABEL: multiple_fadd_use_test_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_clause 0x1 -; GFX11-DENORM-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-DENORM-NEXT: s_load_b32 s0, s[2:3], 0x8 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-DENORM-NEXT: v_add_f16_e64 v0, s2, -1.0 -; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s3, -1.0 +; GFX11-DENORM-NEXT: s_lshr_b32 s1, s0, 16 +; GFX11-DENORM-NEXT: v_add_f16_e64 v0, s0, -1.0 +; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s1, -1.0 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| ; GFX11-DENORM-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo @@ -448,6 +447,7 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_mul_f16_e32 v1, v0, v0 ; GFX11-DENORM-NEXT: v_fma_f16 v0, -v1, v0, 1.0 +; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_store_b16 v2, v0, s[0:1] ; GFX11-DENORM-NEXT: s_nop 0 ; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -455,13 +455,12 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; GFX11-FLUSH-LABEL: multiple_fadd_use_test_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_clause 0x1 -; GFX11-FLUSH-NEXT: 
s_load_b32 s2, s[0:1], 0x8 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-FLUSH-NEXT: s_load_b32 s0, s[2:3], 0x8 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s2, -1.0 -; GFX11-FLUSH-NEXT: v_add_f16_e64 v1, s3, -1.0 +; GFX11-FLUSH-NEXT: s_lshr_b32 s1, s0, 16 +; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s0, -1.0 +; GFX11-FLUSH-NEXT: v_add_f16_e64 v1, s1, -1.0 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| ; GFX11-FLUSH-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo @@ -472,6 +471,7 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v0, 1.0, v0 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-FLUSH-NEXT: s_nop 0 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -496,14 +496,14 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { ; VI-DENORM-LABEL: multiple_use_fadd_fmac_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dword s6, s[4:5], 0x8 -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-DENORM-NEXT: s_load_dword s4, s[6:7], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; VI-DENORM-NEXT: s_lshr_b32 s3, s6, 16 +; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3 -; VI-DENORM-NEXT: v_fma_f16 v3, s6, 2.0, v0 +; VI-DENORM-NEXT: v_fma_f16 v3, s4, 2.0, v0 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 -; VI-DENORM-NEXT: v_add_f16_e64 v2, s6, s6 +; VI-DENORM-NEXT: v_add_f16_e64 v2, s4, s4 ; 
VI-DENORM-NEXT: s_add_u32 s2, s0, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 ; VI-DENORM-NEXT: s_addc_u32 s3, s1, 0 @@ -517,12 +517,12 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; ; VI-FLUSH-LABEL: multiple_use_fadd_fmac_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dword s6, s[4:5], 0x8 -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-FLUSH-NEXT: s_load_dword s4, s[6:7], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; VI-FLUSH-NEXT: s_lshr_b32 s3, s6, 16 +; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 -; VI-FLUSH-NEXT: v_add_f16_e64 v2, s6, s6 +; VI-FLUSH-NEXT: v_add_f16_e64 v2, s4, s4 ; VI-FLUSH-NEXT: s_add_u32 s2, s0, 2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 ; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s3 @@ -530,7 +530,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 -; VI-FLUSH-NEXT: v_mac_f16_e64 v3, s6, 2.0 +; VI-FLUSH-NEXT: v_mac_f16_e64 v3, s4, 2.0 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v3 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) @@ -539,8 +539,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX10-DENORM-LABEL: multiple_use_fadd_fmac_f16: ; GFX10-DENORM: ; %bb.0: ; GFX10-DENORM-NEXT: s_clause 0x1 -; GFX10-DENORM-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-DENORM-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_lshr_b32 s3, s2, 16 @@ -555,8 +555,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX10-FLUSH-LABEL: multiple_use_fadd_fmac_f16: ; GFX10-FLUSH: ; %bb.0: ; 
GFX10-FLUSH-NEXT: s_clause 0x1 -; GFX10-FLUSH-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-FLUSH-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, s2, s2 @@ -571,13 +571,13 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX11-DENORM-LABEL: multiple_use_fadd_fmac_f16: ; GFX11-DENORM: ; %bb.0: ; GFX11-DENORM-NEXT: s_clause 0x1 -; GFX11-DENORM-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-DENORM-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s2, s2 -; GFX11-DENORM-NEXT: v_fma_f16 v2, s2, 2.0, s3 +; GFX11-DENORM-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s4, s4 +; GFX11-DENORM-NEXT: v_fma_f16 v2, s4, 2.0, s2 ; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[0:1] dlc ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] offset:2 dlc @@ -589,12 +589,12 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX11-FLUSH-LABEL: multiple_use_fadd_fmac_f16: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_clause 0x1 -; GFX11-FLUSH-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-FLUSH-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s2, s2 -; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s4, s4 +; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s4, 16 ; 
GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, s2, v0 ; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] dlc @@ -617,14 +617,14 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { ; VI-DENORM-LABEL: multiple_use_fadd_fmad_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dword s6, s[4:5], 0x8 -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-DENORM-NEXT: s_load_dword s4, s[6:7], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; VI-DENORM-NEXT: s_lshr_b32 s3, s6, 16 +; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3 -; VI-DENORM-NEXT: v_fma_f16 v3, |s6|, 2.0, v0 +; VI-DENORM-NEXT: v_fma_f16 v3, |s4|, 2.0, v0 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 -; VI-DENORM-NEXT: v_add_f16_e64 v2, |s6|, |s6| +; VI-DENORM-NEXT: v_add_f16_e64 v2, |s4|, |s4| ; VI-DENORM-NEXT: s_add_u32 s2, s0, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 ; VI-DENORM-NEXT: s_addc_u32 s3, s1, 0 @@ -638,14 +638,14 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; ; VI-FLUSH-LABEL: multiple_use_fadd_fmad_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dword s6, s[4:5], 0x8 -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-FLUSH-NEXT: s_load_dword s4, s[6:7], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; VI-FLUSH-NEXT: s_lshr_b32 s3, s6, 16 +; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s3 -; VI-FLUSH-NEXT: v_mad_f16 v3, |s6|, 2.0, v0 +; VI-FLUSH-NEXT: v_mad_f16 v3, |s4|, 2.0, v0 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 -; VI-FLUSH-NEXT: v_add_f16_e64 v2, |s6|, |s6| +; VI-FLUSH-NEXT: v_add_f16_e64 v2, |s4|, |s4| ; VI-FLUSH-NEXT: s_add_u32 s2, s0, 2 ; VI-FLUSH-NEXT: v_mov_b32_e32 
v1, s1 ; VI-FLUSH-NEXT: s_addc_u32 s3, s1, 0 @@ -660,8 +660,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX10-DENORM-LABEL: multiple_use_fadd_fmad_f16: ; GFX10-DENORM: ; %bb.0: ; GFX10-DENORM-NEXT: s_clause 0x1 -; GFX10-DENORM-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-DENORM-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_lshr_b32 s3, s2, 16 @@ -676,8 +676,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX10-FLUSH-LABEL: multiple_use_fadd_fmad_f16: ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_clause 0x1 -; GFX10-FLUSH-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-FLUSH-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |s2|, |s2| @@ -692,13 +692,13 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX11-DENORM-LABEL: multiple_use_fadd_fmad_f16: ; GFX11-DENORM: ; %bb.0: ; GFX11-DENORM-NEXT: s_clause 0x1 -; GFX11-DENORM-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-DENORM-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-DENORM-NEXT: v_add_f16_e64 v1, |s2|, |s2| -; GFX11-DENORM-NEXT: v_fma_f16 v2, |s2|, 2.0, s3 +; GFX11-DENORM-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-DENORM-NEXT: v_add_f16_e64 v1, |s4|, |s4| +; GFX11-DENORM-NEXT: v_fma_f16 v2, |s4|, 2.0, s2 ; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[0:1] dlc ; 
GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] offset:2 dlc @@ -710,12 +710,12 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX11-FLUSH-LABEL: multiple_use_fadd_fmad_f16: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_clause 0x1 -; GFX11-FLUSH-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-FLUSH-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, |s2|, |s2| -; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, |s4|, |s4| +; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s4, 16 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, s2, v0 ; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] dlc @@ -739,9 +739,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 { ; VI-DENORM-LABEL: multiple_use_fadd_multi_fmad_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; VI-DENORM-NEXT: s_load_dword s6, s[4:5], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; VI-DENORM-NEXT: s_load_dword s6, s[6:7], 0x8 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s0, s0, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 @@ -762,9 +762,9 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; ; VI-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 
-; VI-FLUSH-NEXT: s_load_dword s6, s[4:5], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; VI-FLUSH-NEXT: s_load_dword s6, s[6:7], 0x8 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 @@ -786,14 +786,14 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; GFX10-DENORM-LABEL: multiple_use_fadd_multi_fmad_f16: ; GFX10-DENORM: ; %bb.0: ; GFX10-DENORM-NEXT: s_clause 0x2 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-DENORM-NEXT: s_load_dword s6, s[4:5], 0x8 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX10-DENORM-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_lshr_b32 s0, s0, 16 -; GFX10-DENORM-NEXT: v_fma_f16 v2, |s6|, 2.0, s1 -; GFX10-DENORM-NEXT: v_fma_f16 v1, |s6|, 2.0, s0 +; GFX10-DENORM-NEXT: v_fma_f16 v2, |s4|, 2.0, s1 +; GFX10-DENORM-NEXT: v_fma_f16 v1, |s4|, 2.0, s0 ; GFX10-DENORM-NEXT: global_store_short v0, v1, s[2:3] ; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-DENORM-NEXT: global_store_short v0, v2, s[2:3] offset:2 @@ -803,12 +803,12 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; GFX10-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16: ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_clause 0x2 -; GFX10-FLUSH-NEXT: s_load_dword s6, s[4:5], 0x8 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-FLUSH-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: 
v_add_f16_e64 v0, |s6|, |s6| +; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |s4|, |s4| ; GFX10-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v2, s0, v0 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, s1, v0 @@ -821,17 +821,17 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; GFX11-DENORM-LABEL: multiple_use_fadd_multi_fmad_f16: ; GFX11-DENORM: ; %bb.0: ; GFX11-DENORM-NEXT: s_clause 0x2 -; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-DENORM-NEXT: s_load_b32 s4, s[0:1], 0x8 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-DENORM-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-DENORM-NEXT: v_fma_f16 v2, |s4|, 2.0, s3 -; GFX11-DENORM-NEXT: v_fma_f16 v1, |s4|, 2.0, s2 -; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[0:1] dlc +; GFX11-DENORM-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-DENORM-NEXT: v_fma_f16 v2, |s4|, 2.0, s1 +; GFX11-DENORM-NEXT: v_fma_f16 v1, |s4|, 2.0, s0 +; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[2:3] dlc ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] offset:2 dlc +; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[2:3] offset:2 dlc ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-DENORM-NEXT: s_nop 0 ; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -840,19 +840,19 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; GFX11-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_clause 0x2 -; GFX11-FLUSH-NEXT: s_load_b32 s4, s[0:1], 0x8 -; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-FLUSH-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x8 +; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, |s4|, |s4| -; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, s2, v0 -; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, s3, v0 -; GFX11-FLUSH-NEXT: global_store_b16 v1, v2, s[0:1] dlc +; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, s0, v0 +; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, s1, v0 +; GFX11-FLUSH-NEXT: global_store_b16 v1, v2, s[2:3] dlc ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] offset:2 dlc +; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[2:3] offset:2 dlc ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FLUSH-NEXT: s_nop 0 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -873,8 +873,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { ; VI-LABEL: fmul_x2_xn2_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f16_e64 v0, s2, -4.0 ; VI-NEXT: v_mul_f16_e32 v2, s2, v0 @@ -887,8 +887,8 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x ; GFX10-LABEL: fmul_x2_xn2_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mul_f16_e64 v0, s2, -4.0 @@ -900,13 +900,13 @@ 
define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x ; GFX11-LABEL: fmul_x2_xn2_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f16_e64 v0, s2, -4.0 +; GFX11-NEXT: v_mul_f16_e64 v0, s4, -4.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mul_f16_e32 v0, s2, v0 +; GFX11-NEXT: v_mul_f16_e32 v0, s4, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 @@ -925,8 +925,8 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { ; VI-LABEL: fmul_x2_xn3_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0xc600 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f16_e32 v0, s2, v0 @@ -940,8 +940,8 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x ; GFX10-LABEL: fmul_x2_xn3_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mul_f16_e64 v0, 0xc600, s2 @@ -953,13 +953,13 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x ; GFX11-LABEL: fmul_x2_xn3_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f16_e64 v0, 0xc600, s2 +; GFX11-NEXT: v_mul_f16_e64 v0, 0xc600, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mul_f16_e32 v0, s2, v0 +; GFX11-NEXT: v_mul_f16_e32 v0, s4, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll index 98faaacf1dfb0..7c1c970b3fef7 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll @@ -7,58 +7,58 @@ define amdgpu_kernel void @fmul_f16( ; SI-LABEL: fmul_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; 
SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; GFX89-LABEL: fmul_f16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX89-NEXT: s_mov_b32 s3, 0xf000 -; GFX89-NEXT: s_mov_b32 s2, -1 -; GFX89-NEXT: s_mov_b32 s14, s2 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_mov_b32 s11, 0xf000 +; GFX89-NEXT: s_mov_b32 s10, -1 +; GFX89-NEXT: s_mov_b32 s14, s10 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: s_mov_b32 s12, s6 ; GFX89-NEXT: s_mov_b32 s13, s7 -; GFX89-NEXT: s_mov_b32 s15, s3 -; GFX89-NEXT: s_mov_b32 s10, s2 -; GFX89-NEXT: s_mov_b32 s11, s3 +; GFX89-NEXT: s_mov_b32 s15, s11 +; GFX89-NEXT: s_mov_b32 s2, s10 +; GFX89-NEXT: s_mov_b32 s3, s11 ; GFX89-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; GFX89-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_mov_b32 s0, s4 -; GFX89-NEXT: s_mov_b32 s1, s5 +; GFX89-NEXT: s_mov_b32 s8, s4 +; GFX89-NEXT: s_mov_b32 s9, s5 ; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX89-NEXT: buffer_store_short v0, off, s[8:11], 0 ; GFX89-NEXT: s_endpgm ; ; GFX11-LABEL: fmul_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -93,7 +93,7 @@ entry: define amdgpu_kernel void @fmul_f16_imm_a( ; SI-LABEL: fmul_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: 
s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -113,7 +113,7 @@ define amdgpu_kernel void @fmul_f16_imm_a( ; ; GFX89-LABEL: fmul_f16_imm_a: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -131,7 +131,7 @@ define amdgpu_kernel void @fmul_f16_imm_a( ; ; GFX11-LABEL: fmul_f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -160,7 +160,7 @@ entry: define amdgpu_kernel void @fmul_f16_imm_b( ; SI-LABEL: fmul_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -180,7 +180,7 @@ define amdgpu_kernel void @fmul_f16_imm_b( ; ; GFX89-LABEL: fmul_f16_imm_b: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -198,7 +198,7 @@ define amdgpu_kernel void @fmul_f16_imm_b( ; ; GFX11-LABEL: fmul_f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -227,21 +227,21 @@ entry: define amdgpu_kernel void @fmul_v2f16( ; SI-LABEL: fmul_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -256,60 +256,60 @@ define amdgpu_kernel void @fmul_v2f16( ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fmul_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; 
VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_f16_sdwa v2, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_e32 v0, v1, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fmul_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s14, s10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s15, s3 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_mov_b32 s2, s10 +; GFX9-NEXT: s_mov_b32 s3, s11 ; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fmul_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -343,7 +343,7 @@ entry: define amdgpu_kernel 
void @fmul_v2f16_imm_a( ; SI-LABEL: fmul_v2f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -369,7 +369,7 @@ define amdgpu_kernel void @fmul_v2f16_imm_a( ; ; VI-LABEL: fmul_v2f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -390,7 +390,7 @@ define amdgpu_kernel void @fmul_v2f16_imm_a( ; ; GFX9-LABEL: fmul_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -409,7 +409,7 @@ define amdgpu_kernel void @fmul_v2f16_imm_a( ; ; GFX11-LABEL: fmul_v2f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -438,7 +438,7 @@ entry: define amdgpu_kernel void @fmul_v2f16_imm_b( ; SI-LABEL: fmul_v2f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -464,7 +464,7 @@ define amdgpu_kernel void @fmul_v2f16_imm_b( ; ; VI-LABEL: fmul_v2f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -485,7 +485,7 @@ define amdgpu_kernel void @fmul_v2f16_imm_b( ; ; GFX9-LABEL: fmul_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -504,7 +504,7 @@ define amdgpu_kernel void @fmul_v2f16_imm_b( ; ; GFX11-LABEL: fmul_v2f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -533,21 +533,21 @@ entry: define amdgpu_kernel void @fmul_v4f16( ; SI-LABEL: fmul_v4f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s12, s10 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_mov_b32 s13, s11 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s8 +; SI-NEXT: s_mov_b32 s5, s9 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -574,26 +574,26 @@ define amdgpu_kernel void @fmul_v4f16( ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; 
VI-LABEL: fmul_v4f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_f16_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_e32 v1, v3, v1 @@ -601,37 +601,37 @@ define amdgpu_kernel void @fmul_v4f16( ; VI-NEXT: v_mul_f16_e32 v0, v2, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v3 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fmul_v4f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s2, s10 +; GFX9-NEXT: s_mov_b32 s3, s11 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s14, s2 -; GFX9-NEXT: s_mov_b32 s15, s3 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_f16 v1, v3, v1 ; GFX9-NEXT: v_pk_mul_f16 v0, v2, v0 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fmul_v4f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -666,7 +666,7 @@ entry: define amdgpu_kernel void @fmul_v4f16_imm_a( ; SI-LABEL: fmul_v4f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -701,7 +701,7 @@ define amdgpu_kernel void @fmul_v4f16_imm_a( ; ; VI-LABEL: fmul_v4f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -725,7 +725,7 @@ define amdgpu_kernel void @fmul_v4f16_imm_a( ; ; GFX9-LABEL: fmul_v4f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; 
GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -746,7 +746,7 @@ define amdgpu_kernel void @fmul_v4f16_imm_a( ; ; GFX11-LABEL: fmul_v4f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll index 718be90eb75fc..9300dfcb16e8a 100644 --- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll @@ -22,7 +22,7 @@ declare half @llvm.fabs.f16(half) #1 define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; VI-FLUSH-LABEL: fmuladd_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -42,7 +42,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-DENORM-LABEL: fmuladd_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3 @@ -62,7 +62,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-FLUSH-LABEL: fmuladd_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_clause 0x2 @@ -78,7 +78,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-DENORM-LABEL: fmuladd_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx8 
s[0:7], s[0:1], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_clause 0x2 @@ -92,7 +92,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-FLUSH-LABEL: fmuladd_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_clause 0x2 @@ -111,7 +111,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-DENORM-LABEL: fmuladd_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: s_clause 0x2 @@ -136,7 +136,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; VI-FLUSH-LABEL: fmul_fadd_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -156,7 +156,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-DENORM-CONTRACT-LABEL: fmul_fadd_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, s2 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -176,7 +176,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; 
GFX10-FLUSH-LABEL: fmul_fadd_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_clause 0x2 @@ -192,7 +192,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-DENORM-STRICT-LABEL: fmul_fadd_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: s_clause 0x2 @@ -208,7 +208,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-DENORM-CONTRACT-LABEL: fmul_fadd_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: s_clause 0x2 @@ -222,7 +222,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-FLUSH-LABEL: fmul_fadd_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_clause 0x2 @@ -241,7 +241,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-DENORM-STRICT-LABEL: fmul_fadd_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-DENORM-STRICT-NEXT: s_clause 0x2 @@ -260,7 +260,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-DENORM-CONTRACT-LABEL: fmul_fadd_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: s_clause 0x2 @@ -286,7 +286,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; VI-FLUSH-LABEL: fmul_fadd_contract_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -306,7 +306,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; VI-DENORM-LABEL: fmul_fadd_contract_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3 @@ -326,7 +326,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; GFX10-FLUSH-LABEL: fmul_fadd_contract_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_clause 0x2 @@ -342,7 +342,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; GFX10-DENORM-LABEL: fmul_fadd_contract_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 
0x24 +; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_clause 0x2 @@ -356,7 +356,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; GFX11-FLUSH-LABEL: fmul_fadd_contract_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_clause 0x2 @@ -375,7 +375,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; GFX11-DENORM-LABEL: fmul_fadd_contract_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: s_clause 0x2 @@ -401,7 +401,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_2.0_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -419,7 +419,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; VI-DENORM-LABEL: fmuladd_2.0_a_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -437,7 +437,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-FLUSH-LABEL: 
fmuladd_2.0_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -451,7 +451,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-LABEL: fmuladd_2.0_a_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -464,7 +464,9 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-FLUSH-LABEL: fmuladd_2.0_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -481,7 +483,9 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-LABEL: fmuladd_2.0_a_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -509,7 +513,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrspace(1) 
%in) #0 { ; VI-FLUSH-LABEL: fmuladd_a_2.0_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -527,7 +531,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; VI-DENORM-LABEL: fmuladd_a_2.0_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -545,7 +549,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-FLUSH-LABEL: fmuladd_a_2.0_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -559,7 +563,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-LABEL: fmuladd_a_2.0_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -572,7 +576,9 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-FLUSH-LABEL: fmuladd_a_2.0_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; 
GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -589,7 +595,9 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-LABEL: fmuladd_a_2.0_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -617,7 +625,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; VI-FLUSH-LABEL: fadd_a_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -635,7 +643,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; VI-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 @@ -653,7 +661,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX10-FLUSH-LABEL: fadd_a_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -667,7 +675,7 @@ define amdgpu_kernel void 
@fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX10-DENORM-STRICT-LABEL: fadd_a_a_b_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -681,7 +689,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX10-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -694,7 +702,9 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX11-FLUSH-LABEL: fadd_a_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -711,7 +721,9 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX11-DENORM-STRICT-LABEL: fadd_a_a_b_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -728,7 
+740,9 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX11-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -759,7 +773,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; VI-FLUSH-LABEL: fadd_b_a_a_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -777,7 +791,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; VI-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 @@ -795,7 +809,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX10-FLUSH-LABEL: fadd_b_a_a_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -809,7 +823,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX10-DENORM-STRICT-LABEL: 
fadd_b_a_a_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -823,7 +837,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX10-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -836,7 +850,9 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX11-FLUSH-LABEL: fadd_b_a_a_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -853,7 +869,9 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX11-DENORM-STRICT-LABEL: fadd_b_a_a_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -870,7 +888,9 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) 
%out, ; ; GFX11-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -901,7 +921,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -919,7 +939,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; VI-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -937,7 +957,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -951,7 +971,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16: ; 
GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -964,7 +984,9 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -981,7 +1003,9 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1009,7 +1033,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -1027,7 +1051,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; VI-DENORM-LABEL: 
fmuladd_neg_2.0_neg_a_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -1045,7 +1069,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1059,7 +1083,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; GFX10-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1072,7 +1096,9 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1089,7 +1115,9 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; GFX11-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 
0x24 +; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1119,7 +1147,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -1137,7 +1165,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; VI-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -1155,7 +1183,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1169,7 +1197,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1182,7 +1210,9 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1199,7 +1229,9 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1229,7 +1261,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -1247,7 +1279,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; VI-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; 
VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -1265,7 +1297,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1279,7 +1311,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1292,7 +1324,9 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1309,7 +1343,9 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ 
-1339,7 +1375,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -1364,7 +1400,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -1389,7 +1425,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX10-FLUSH-LABEL: mad_sub_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1405,7 +1441,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1421,7 +1457,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; 
GFX10-DENORM-CONTRACT-LABEL: mad_sub_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1436,7 +1472,9 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX11-FLUSH-LABEL: mad_sub_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1455,7 +1493,9 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1474,7 +1514,9 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; 
GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1508,7 +1550,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_inv_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -1533,7 +1575,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_inv_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -1558,7 +1600,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-FLUSH-LABEL: mad_sub_inv_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1574,7 +1616,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_inv_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: 
global_load_ushort v1, v0, s[2:3] glc dlc @@ -1590,7 +1632,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_inv_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1605,7 +1647,9 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-FLUSH-LABEL: mad_sub_inv_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1624,7 +1668,9 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_inv_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1643,7 +1689,9 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_inv_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; 
GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1677,7 +1725,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_fabs_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -1702,7 +1750,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -1727,7 +1775,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-FLUSH-LABEL: mad_sub_fabs_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1743,7 +1791,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1759,7 +1807,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1774,7 +1822,9 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-FLUSH-LABEL: mad_sub_fabs_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1793,7 +1843,9 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1812,7 +1864,9 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16: ; 
GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1847,7 +1901,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_fabs_inv_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -1872,7 +1926,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -1897,7 +1951,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX10-FLUSH-LABEL: mad_sub_fabs_inv_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1913,7 +1967,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias 
nocaptu ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1929,7 +1983,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1944,7 +1998,9 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX11-FLUSH-LABEL: mad_sub_fabs_inv_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1963,7 +2019,9 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: 
global_load_u16 v1, v0, s[2:3] glc dlc @@ -1982,7 +2040,9 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2017,7 +2077,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: neg_neg_mad_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -2042,7 +2102,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; VI-DENORM-CONTRACT-LABEL: neg_neg_mad_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -2067,7 +2127,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-FLUSH-LABEL: neg_neg_mad_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -2083,7 +2143,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-DENORM-STRICT-LABEL: neg_neg_mad_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -2099,7 +2159,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-DENORM-CONTRACT-LABEL: neg_neg_mad_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -2114,7 +2174,9 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-FLUSH-LABEL: neg_neg_mad_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2133,7 +2195,9 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-DENORM-STRICT-LABEL: neg_neg_mad_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-STRICT-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2152,7 +2216,9 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-DENORM-CONTRACT-LABEL: neg_neg_mad_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2188,7 +2254,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_fabs_sub_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -2213,7 +2279,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; VI-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -2238,7 +2304,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-FLUSH-LABEL: mad_fabs_sub_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -2254,7 +2320,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-DENORM-STRICT-LABEL: mad_fabs_sub_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -2270,7 +2336,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -2285,7 +2351,9 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-FLUSH-LABEL: mad_fabs_sub_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2304,7 +2372,9 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-DENORM-STRICT-LABEL: mad_fabs_sub_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], 
s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2323,7 +2393,9 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2358,7 +2430,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; VI-FLUSH-LABEL: fsub_c_fadd_a_a_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -2376,7 +2448,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; VI-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 @@ -2394,7 +2466,7 @@ define amdgpu_kernel void 
@fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-FLUSH-LABEL: fsub_c_fadd_a_a_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2408,7 +2480,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2422,7 +2494,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2435,7 +2507,9 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-FLUSH-LABEL: fsub_c_fadd_a_a_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -2452,7 +2526,9 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) 
%out, ptr addrsp ; ; GFX11-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -2469,7 +2545,9 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -2499,7 +2577,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; VI-FLUSH-LABEL: fsub_fadd_a_a_c_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -2517,7 +2595,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; VI-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; 
VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 @@ -2535,7 +2613,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-FLUSH-LABEL: fsub_fadd_a_a_c_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2549,7 +2627,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2563,7 +2641,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2576,7 +2654,9 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-FLUSH-LABEL: fsub_fadd_a_a_c_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -2593,7 +2673,9 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -2610,7 +2692,9 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll index f411a76e75ab6..ba8b6fb80518f 100644 --- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll +++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll @@ -15,8 +15,8 @@ declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) #0 define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 { ; SI-LABEL: fnearbyint_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -28,23 +28,24 @@ define 
amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 { ; ; CI-LABEL: fnearbyint_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[0:1], 0xb -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_load_dword s0, s[2:3], 0xb ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_rndne_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: fnearbyint_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rndne_f16_e32 v2, s2 +; VI-NEXT: v_rndne_f16_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -53,11 +54,11 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 { ; GFX11-LABEL: fnearbyint_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rndne_f16_e32 v1, s2 +; GFX11-NEXT: v_rndne_f16_e32 v1, s4 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -70,8 +71,8 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 { define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 { ; SICI-LABEL: fnearbyint_f32: ; SICI: ; %bb.0: ; %entry -; SICI-NEXT: s_load_dword s4, s[0:1], 0xb -; 
SICI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SICI-NEXT: s_load_dword s4, s[2:3], 0xb +; SICI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SICI-NEXT: s_mov_b32 s3, 0xf000 ; SICI-NEXT: s_mov_b32 s2, -1 ; SICI-NEXT: s_waitcnt lgkmcnt(0) @@ -81,10 +82,10 @@ define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 { ; ; VI-LABEL: fnearbyint_f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rndne_f32_e32 v2, s2 +; VI-NEXT: v_rndne_f32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -93,11 +94,11 @@ define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 { ; GFX11-LABEL: fnearbyint_f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rndne_f32_e32 v1, s2 +; GFX11-NEXT: v_rndne_f32_e32 v1, s4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -111,7 +112,7 @@ entry: define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> %in) #1 { ; SICI-LABEL: fnearbyint_v2f32: ; SICI: ; %bb.0: ; %entry -; SICI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SICI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SICI-NEXT: s_mov_b32 s7, 0xf000 ; SICI-NEXT: s_mov_b32 s6, -1 ; SICI-NEXT: s_waitcnt lgkmcnt(0) @@ -124,7 +125,7 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fnearbyint_v2f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_rndne_f32_e32 v1, s3 @@ -135,7 +136,7 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> % ; ; GFX11-LABEL: fnearbyint_v2f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f32_e32 v1, s3 @@ -153,8 +154,8 @@ entry: define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> %in) #1 { ; SICI-LABEL: fnearbyint_v4f32: ; SICI: ; %bb.0: ; %entry -; SICI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SICI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SICI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SICI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SICI-NEXT: s_mov_b32 s3, 0xf000 ; SICI-NEXT: s_mov_b32 s2, -1 ; SICI-NEXT: s_waitcnt lgkmcnt(0) @@ -167,8 +168,8 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> % ; ; VI-LABEL: fnearbyint_v4f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_rndne_f32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -182,8 +183,8 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> % ; GFX11-LABEL: fnearbyint_v4f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f32_e32 v3, s7 @@ -203,7 +204,7 @@ entry: define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double 
%in) { ; SI-LABEL: nearbyint_f64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_brev_b32 s8, -2 @@ -227,7 +228,7 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; ; CI-LABEL: nearbyint_f64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_rndne_f64_e32 v[0:1], s[2:3] ; CI-NEXT: s_mov_b32 s3, 0xf000 @@ -237,7 +238,7 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; ; VI-LABEL: nearbyint_f64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_rndne_f64_e32 v[0:1], s[2:3] ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -247,7 +248,7 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; ; GFX11-LABEL: nearbyint_f64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[2:3] @@ -263,41 +264,41 @@ entry: define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %in) { ; SI-LABEL: nearbyint_v2f64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_brev_b32 s10, -2 ; SI-NEXT: v_mov_b32_e32 v6, 0x43300000 ; SI-NEXT: s_mov_b32 s9, 0x432fffff ; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s8, s2 ; SI-NEXT: 
v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, s3 +; SI-NEXT: v_mov_b32_e32 v7, s7 ; SI-NEXT: v_bfi_b32 v1, s10, v6, v7 -; SI-NEXT: v_mov_b32_e32 v8, s2 -; SI-NEXT: v_mov_b32_e32 v9, s1 -; SI-NEXT: v_mov_b32_e32 v10, s0 -; SI-NEXT: v_add_f64 v[2:3], s[2:3], v[0:1] +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: v_mov_b32_e32 v9, s5 +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: v_add_f64 v[2:3], s[6:7], v[0:1] ; SI-NEXT: v_add_f64 v[2:3], v[2:3], -v[0:1] ; SI-NEXT: v_bfi_b32 v1, s10, v6, v9 -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[4:5] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[4:5] ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; SI-NEXT: v_add_f64 v[6:7], s[0:1], v[0:1] +; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[0:1] ; SI-NEXT: v_add_f64 v[0:1], v[6:7], -v[0:1] -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[0:1]|, v[4:5] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[4:5] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: nearbyint_v2f64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -308,8 +309,8 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> % ; ; VI-LABEL: nearbyint_v2f64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] ; VI-NEXT: v_rndne_f64_e32 
v[0:1], s[4:5] @@ -321,8 +322,8 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> % ; GFX11-LABEL: nearbyint_v2f64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] @@ -340,8 +341,8 @@ entry: define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %in) { ; SI-LABEL: nearbyint_v4f64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x11 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_brev_b32 s14, -2 @@ -390,8 +391,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> % ; ; CI-LABEL: nearbyint_v4f64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -405,8 +406,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> % ; ; VI-LABEL: nearbyint_v4f64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_rndne_f64_e32 v[6:7], s[10:11] ; VI-NEXT: v_rndne_f64_e32 v[4:5], s[8:9] @@ -425,8 +426,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> % ; GFX11-LABEL: nearbyint_v4f64: ; 
GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x44 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f64_e32 v[6:7], s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index b5440b9c38c9f..74e2b9ea71425 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -2799,7 +2799,7 @@ define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x ha define amdgpu_kernel void @s_fneg_select_infloop_regression_f32(float %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -2813,7 +2813,7 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f32(float %arg, i1 % ; ; VI-LABEL: s_fneg_select_infloop_regression_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -3016,41 +3016,41 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32_commute2(float define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0xd ; 
SI-NEXT: v_bfrev_b32_e32 v0, 1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s4, 0 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: s_and_b64 s[6:7], s[4:5], exec ; SI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5] -; SI-NEXT: s_cselect_b32 s2, 0, s2 -; SI-NEXT: v_mov_b32_e32 v3, s1 +; SI-NEXT: s_cselect_b32 s0, 0, s0 +; SI-NEXT: v_mov_b32_e32 v2, s2 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v3, s3 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fneg_select_infloop_regression_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 ; VI-NEXT: v_bfrev_b32_e32 v0, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s4, 0 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_and_b64 s[6:7], s[4:5], exec ; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5] -; VI-NEXT: s_cselect_b32 s2, 0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_cselect_b32 s0, 0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %i = select i1 %arg1, double 0.0, double %arg @@ -3080,11 +3080,11 @@ define double @v_fneg_select_infloop_regression_f64(double %arg, i1 %arg1) { define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_f16: ; 
SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_bitcmp1_b32 s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_bitcmp1_b32 s4, 16 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3] ; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, s[2:3] @@ -3096,11 +3096,11 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %a ; ; VI-LABEL: s_fneg_select_infloop_regression_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dword s4, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitcmp1_b32 s2, 16 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_bitcmp1_b32 s4, 16 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3] ; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 @@ -3146,7 +3146,7 @@ define half @v_fneg_select_infloop_regression_f16(half %arg, i1 %arg1) { define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s1, 1, s1 ; SI-NEXT: s_cselect_b32 s0, 0, s0 @@ -3161,7 +3161,7 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %ar ; ; VI-LABEL: s_fneg_select_infloop_regression_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s1, 1, s1 ; VI-NEXT: s_cselect_b32 s0, 0, s0 @@ -3216,8 +3216,8 @@ 
define <2 x half> @v_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %a define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: v_bfrev_b32_e32 v0, 1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s6, 0 @@ -3235,8 +3235,8 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %a ; ; VI-LABEL: s_fneg_select_infloop_regression_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_bfrev_b32_e32 v0, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s6, 0 @@ -3279,7 +3279,7 @@ define <2 x float> @v_fneg_select_infloop_regression_v2f32(<2 x float> %arg, i1 define amdgpu_kernel void @s_fabs_select_infloop_regression_f32(float %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fabs_select_infloop_regression_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -3293,7 +3293,7 @@ define amdgpu_kernel void @s_fabs_select_infloop_regression_f32(float %arg, i1 % ; ; VI-LABEL: s_fabs_select_infloop_regression_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -3329,7 +3329,7 @@ define float @v_fabs_select_infloop_regression_f32(float %arg, i1 %arg1) { define amdgpu_kernel void 
@s_fneg_fabs_select_infloop_regression(float %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_fabs_select_infloop_regression: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -3343,7 +3343,7 @@ define amdgpu_kernel void @s_fneg_fabs_select_infloop_regression(float %arg, i1 ; ; VI-LABEL: s_fneg_fabs_select_infloop_regression: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index 4364b32e62f8c..8267bb9f5450f 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -7,12 +7,12 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, half %y) { ; CI-LABEL: fneg_fabs_fadd_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[4:5], 0x2 +; CI-NEXT: s_load_dword s0, s[6:7], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| ; CI-NEXT: s_lshr_b32 s0, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -23,8 +23,8 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; ; VI-LABEL: fneg_fabs_fadd_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -36,8 +36,8 @@ define amdgpu_kernel void 
@fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; ; GFX9-LABEL: fneg_fabs_fadd_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s3, s2, 16 @@ -49,13 +49,13 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; GFX11-LABEL: fneg_fabs_fadd_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_sub_f16_e64 v1, s3, |s2| +; GFX11-NEXT: v_sub_f16_e64 v1, s2, |s4| ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -70,13 +70,13 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, half %y) { ; CI-LABEL: fneg_fabs_fmul_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[4:5], 0x2 +; CI-NEXT: s_load_dword s0, s[6:7], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s1, s0, 0x7fff ; CI-NEXT: s_lshr_b32 s0, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; CI-NEXT: v_cvt_f32_f16_e64 v1, -|s1| -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mul_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -87,8 +87,8 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; ; VI-LABEL: fneg_fabs_fmul_f16: ; VI: ; %bb.0: -; VI-NEXT: 
s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -100,8 +100,8 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; ; GFX9-LABEL: fneg_fabs_fmul_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s3, s2, 16 @@ -113,13 +113,13 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; GFX11-LABEL: fneg_fabs_fmul_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mul_f16_e64 v1, s3, -|s2| +; GFX11-NEXT: v_mul_f16_e64 v1, s2, -|s4| ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -137,8 +137,8 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; CI-LABEL: fneg_fabs_free_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitset1_b32 s2, 15 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -149,8 +149,8 @@ define amdgpu_kernel 
void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; ; VI-LABEL: fneg_fabs_free_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -161,8 +161,8 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; ; GFX9-LABEL: fneg_fabs_free_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bitset1_b32 s2, 15 @@ -173,10 +173,10 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; GFX11-LABEL: fneg_fabs_free_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset1_b32 s2, 15 +; GFX11-NEXT: s_or_b32 s2, s4, 0x8000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -193,8 +193,8 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; CI-LABEL: fneg_fabs_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitset1_b32 s2, 15 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -205,8 +205,8 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr 
addrspace(1) %out, half %in) { ; ; VI-LABEL: fneg_fabs_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -217,8 +217,8 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; ; GFX9-LABEL: fneg_fabs_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bitset1_b32 s2, 15 @@ -229,10 +229,10 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; GFX11-LABEL: fneg_fabs_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset1_b32 s2, 15 +; GFX11-NEXT: s_or_b32 s2, s4, 0x8000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -248,7 +248,7 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; CIVI-LABEL: v_fneg_fabs_f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -262,7 +262,7 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: v_fneg_fabs_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -273,7 +273,7 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: v_fneg_fabs_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -293,12 +293,12 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <2 x half> %in) { ; CI-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[4:5], 0x2 +; CI-NEXT: s_load_dword s0, s[6:7], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s1 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_add_f32_e32 v1, 2.0, v1 ; CI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -314,8 +314,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; ; VI-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0x4000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 @@ -331,8 +331,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; ; GFX9-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40003c00 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -344,11 +344,11 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; GFX11-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, s2 +; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x80008000, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -367,8 +367,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x half> %in) { ; CI-LABEL: s_fneg_fabs_v2f16_bc_src: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_or_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -379,8 +379,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: s_fneg_fabs_v2f16_bc_src: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_or_b32 s2, s2, 0x80008000 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -391,8 +391,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: s_fneg_fabs_v2f16_bc_src: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_or_b32 s2, s2, 0x80008000 @@ -403,10 +403,10 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x ; GFX11-LABEL: s_fneg_fabs_v2f16_bc_src: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s2, 0x80008000 +; GFX11-NEXT: s_or_b32 s2, s4, 0x80008000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -422,7 +422,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; CIVI-LABEL: fneg_fabs_v4f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_or_b32 s3, s3, 0x80008000 ; CIVI-NEXT: s_or_b32 s2, s2, 0x80008000 @@ -435,7 +435,7 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in ; ; GFX9-LABEL: fneg_fabs_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_or_b32 s3, s3, 0x80008000 @@ -447,7 +447,7 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in ; ; GFX11-LABEL: fneg_fabs_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_or_b32 s2, s2, 
0x80008000 ; GFX11-NEXT: s_or_b32 s3, s3, 0x80008000 @@ -467,12 +467,12 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) #0 { ; CI-LABEL: fold_user_fneg_fabs_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[4:5], 0x2 +; CI-NEXT: s_load_dword s0, s[6:7], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |s1| ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mul_f32_e32 v1, -4.0, v1 ; CI-NEXT: v_mul_f32_e32 v0, -4.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -487,8 +487,8 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: fold_user_fneg_fabs_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0xc400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 @@ -503,8 +503,8 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: fold_user_fneg_fabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -515,11 +515,11 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; GFX11-LABEL: fold_user_fneg_fabs_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; 
GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff +; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_pk_mul_f16 v1, s2, -4.0 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -536,8 +536,8 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x half> %in) { ; CI-LABEL: s_fneg_multi_use_fabs_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff @@ -553,8 +553,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p ; ; VI-LABEL: s_fneg_multi_use_fabs_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff @@ -570,11 +570,11 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p ; ; GFX9-LABEL: s_fneg_multi_use_fabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s6, 0x7fff7fff +; GFX9-NEXT: s_and_b32 s4, s4, 0x7fff7fff ; GFX9-NEXT: s_xor_b32 s5, s4, 0x80008000 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -585,8 +585,8 @@ define amdgpu_kernel void 
@s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p ; GFX11-LABEL: s_fneg_multi_use_fabs_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x10 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s4, s4, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) @@ -609,8 +609,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x half> %in) { ; CI-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_bfe_u32 s0, s4, 0xf0010 @@ -633,8 +633,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; ; VI-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_mov_b32_e32 v5, 0xc400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -654,11 +654,11 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; ; GFX9-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s6, 0x7fff7fff 
+; GFX9-NEXT: s_and_b32 s4, s4, 0x7fff7fff ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_pk_mul_f16 v1, s4, -4.0 op_sel_hi:[1,0] ; GFX9-NEXT: global_store_dword v0, v2, s[0:1] @@ -668,8 +668,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; GFX11-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x10 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s4, s4, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll index 2c9042ec17da8..d0115523b1882 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll @@ -5,8 +5,8 @@ define amdgpu_kernel void @fneg_fabs_fadd_f64(ptr addrspace(1) %out, double %x, double %y) { ; SI-LABEL: fneg_fabs_fadd_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -20,8 +20,8 @@ define amdgpu_kernel void @fneg_fabs_fadd_f64(ptr addrspace(1) %out, double %x, ; ; VI-LABEL: fneg_fabs_fadd_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -40,7 +40,7 @@ define amdgpu_kernel void @fneg_fabs_fadd_f64(ptr addrspace(1) %out, double %x, define amdgpu_kernel void @v_fneg_fabs_fadd_f64(ptr addrspace(1) %out, ptr addrspace(1) %xptr, ptr 
addrspace(1) %yptr) { ; SI-LABEL: v_fneg_fabs_fadd_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -52,7 +52,7 @@ define amdgpu_kernel void @v_fneg_fabs_fadd_f64(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: v_fneg_fabs_fadd_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -73,8 +73,8 @@ define amdgpu_kernel void @v_fneg_fabs_fadd_f64(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @fneg_fabs_fmul_f64(ptr addrspace(1) %out, double %x, double %y) { ; SI-LABEL: fneg_fabs_fmul_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -88,8 +88,8 @@ define amdgpu_kernel void @fneg_fabs_fmul_f64(ptr addrspace(1) %out, double %x, ; ; VI-LABEL: fneg_fabs_fmul_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -108,7 +108,7 @@ define amdgpu_kernel void @fneg_fabs_fmul_f64(ptr addrspace(1) %out, double %x, define amdgpu_kernel void @fneg_fabs_free_f64(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: fneg_fabs_free_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitset1_b32 s3, 
31 @@ -122,7 +122,7 @@ define amdgpu_kernel void @fneg_fabs_free_f64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: fneg_fabs_free_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_or_b32 s0, s3, 0x80000000 @@ -174,8 +174,8 @@ define amdgpu_kernel void @fneg_fabs_fn_free_f64(ptr addrspace(1) %out, i64 %in) define amdgpu_kernel void @fneg_fabs_f64(ptr addrspace(1) %out, [8 x i32], double %in) { ; SI-LABEL: fneg_fabs_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitset1_b32 s5, 31 @@ -187,14 +187,14 @@ define amdgpu_kernel void @fneg_fabs_f64(ptr addrspace(1) %out, [8 x i32], doubl ; ; VI-LABEL: fneg_fabs_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset1_b32 s3, 31 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_bitset1_b32 s1, 31 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %fabs = call double @llvm.fabs.f64(double %in) @@ -206,8 +206,8 @@ define amdgpu_kernel void @fneg_fabs_f64(ptr addrspace(1) %out, [8 x i32], doubl define amdgpu_kernel void @fneg_fabs_v2f64(ptr addrspace(1) %out, <2 x double> %in) { ; SI-LABEL: fneg_fabs_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitset1_b32 s7, 31 @@ -222,8 +222,8 @@ define amdgpu_kernel void @fneg_fabs_v2f64(ptr addrspace(1) %out, <2 x double> % ; ; VI-LABEL: fneg_fabs_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_or_b32 s2, s7, 0x80000000 ; VI-NEXT: s_or_b32 s3, s5, 0x80000000 @@ -244,8 +244,8 @@ define amdgpu_kernel void @fneg_fabs_v2f64(ptr addrspace(1) %out, <2 x double> % define amdgpu_kernel void @fneg_fabs_v4f64(ptr addrspace(1) %out, <4 x double> %in) { ; SI-LABEL: fneg_fabs_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -267,8 +267,8 @@ define amdgpu_kernel void @fneg_fabs_v4f64(ptr addrspace(1) %out, <4 x double> % ; ; VI-LABEL: fneg_fabs_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s7, 31 ; VI-NEXT: s_bitset1_b32 s5, 31 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll index 3c000d4fa63a3..6446145bbfe2a 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x, float %y) { ; SI-LABEL: fneg_fabsf_fadd_f32: ; SI: ; %bb.0: -; SI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -18,7 +18,7 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x, ; ; VI-LABEL: fneg_fabsf_fadd_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_sub_f32_e64 v2, s3, |v0| @@ -36,7 +36,7 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x, define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x, float %y) { ; SI-LABEL: fneg_fabsf_fmul_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -49,7 +49,7 @@ define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x, ; ; VI-LABEL: fneg_fabsf_fmul_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mul_f32_e64 v2, s3, -|v0| @@ -67,11 +67,11 @@ define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x, define amdgpu_kernel void @fneg_fabsf_free_f32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: fneg_fabsf_free_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_or_b32 s4, s2, 0x80000000 +; SI-NEXT: s_bitset1_b32 s4, 31 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -79,10 +79,10 @@ define amdgpu_kernel void 
@fneg_fabsf_free_f32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: fneg_fabsf_free_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset1_b32 s2, 31 +; VI-NEXT: s_or_b32 s2, s4, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -129,11 +129,11 @@ define amdgpu_kernel void @fneg_fabsf_fn_free_f32(ptr addrspace(1) %out, i32 %in define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: fneg_fabsf_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_or_b32 s4, s2, 0x80000000 +; SI-NEXT: s_bitset1_b32 s4, 31 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -141,10 +141,10 @@ define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) { ; ; VI-LABEL: fneg_fabsf_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset1_b32 s2, 31 +; VI-NEXT: s_or_b32 s2, s4, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -159,7 +159,7 @@ define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_fneg_fabsf_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: 
s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -177,7 +177,7 @@ define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: v_fneg_fabsf_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -198,7 +198,7 @@ define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-LABEL: fneg_fabsf_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitset1_b32 s3, 31 @@ -213,7 +213,7 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fneg_fabsf_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s3, 31 ; VI-NEXT: s_bitset1_b32 s2, 31 @@ -232,8 +232,8 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> % define amdgpu_kernel void @fneg_fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-LABEL: fneg_fabsf_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitset1_b32 s7, 31 @@ -250,8 +250,8 @@ define amdgpu_kernel void @fneg_fabsf_v4f32(ptr addrspace(1) %out, <4 x float> % ; ; VI-LABEL: fneg_fabsf_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_or_b32 s2, s7, 0x80000000 ; VI-NEXT: s_or_b32 s3, s6, 0x80000000 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index cd1ec85eb8d0f..63ccaafeda88f 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -1475,11 +1475,11 @@ define { double, double } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_fo define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i1 %z, ptr addrspace(1) %dst) { ; GFX7-LABEL: multiple_uses_fneg_select_f64: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s6, s[4:5], 0x4 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x6 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x4 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x6 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bitcmp1_b32 s6, 0 +; GFX7-NEXT: s_bitcmp1_b32 s8, 0 ; GFX7-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX7-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX7-NEXT: v_mov_b32_e32 v0, s3 @@ -1497,12 +1497,12 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i ; ; GFX9-LABEL: multiple_uses_fneg_select_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s8, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x18 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x18 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bitcmp1_b32 s6, 0 +; GFX9-NEXT: s_bitcmp1_b32 s8, 0 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v0, s3 @@ -1519,13 +1519,13 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i ; 
GFX11-LABEL: multiple_uses_fneg_select_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x18 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s8, s[2:3], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x18 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, s5 -; GFX11-NEXT: s_bitcmp1_b32 s2, 0 +; GFX11-NEXT: s_bitcmp1_b32 s8, 0 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, s7, v0, vcc_lo @@ -1549,7 +1549,7 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i define amdgpu_kernel void @fnge_select_f32_multi_use_regression(float %.i2369) { ; GCN-LABEL: fnge_select_f32_multi_use_regression: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 +; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_nlt_f32_e64 s[0:1], s0, 0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -1562,7 +1562,7 @@ define amdgpu_kernel void @fnge_select_f32_multi_use_regression(float %.i2369) { ; ; GFX11-LABEL: fnge_select_f32_multi_use_regression: ; GFX11: ; %bb.0: ; %.entry -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nlt_f32_e64 s0, s0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll index 31c1389c94020..40982347f3ca0 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll @@ -8,8 +8,8 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { ; CI-LABEL: s_fneg_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, 
s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x8000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -20,8 +20,8 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { ; ; GFX8-LABEL: s_fneg_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -32,8 +32,8 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { ; ; GFX9-LABEL: s_fneg_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_xor_b32 s2, s2, 0x8000 @@ -44,10 +44,10 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { ; GFX11-LABEL: s_fneg_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s2, 0x8000 +; GFX11-NEXT: s_xor_b32 s2, s4, 0x8000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -64,7 +64,7 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fneg_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; CI-NEXT: 
s_load_dwordx2 s[0:1], s[6:7], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -78,7 +78,7 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX8-LABEL: v_fneg_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -92,7 +92,7 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: v_fneg_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -103,7 +103,9 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX11-LABEL: v_fneg_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -125,8 +127,8 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { ; CI-LABEL: s_fneg_free_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x8000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -137,8 +139,8 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { ; ; GFX8-LABEL: s_fneg_free_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x8 
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -149,8 +151,8 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { ; ; GFX9-LABEL: s_fneg_free_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_xor_b32 s2, s2, 0x8000 @@ -161,10 +163,10 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { ; GFX11-LABEL: s_fneg_free_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s2, 0x8000 +; GFX11-NEXT: s_xor_b32 s2, s4, 0x8000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -180,7 +182,7 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fneg_fold_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -197,7 +199,7 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: v_fneg_fold_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -211,7 +213,7 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: v_fneg_fold_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -222,7 +224,7 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: v_fneg_fold_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -242,8 +244,8 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) #0 { ; CI-LABEL: s_fneg_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -254,8 +256,8 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) # ; ; GFX8-LABEL: s_fneg_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -266,8 +268,8 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) # ; ; GFX9-LABEL: s_fneg_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_xor_b32 s2, s2, 0x80008000 @@ -278,10 +280,10 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) # ; GFX11-LABEL: s_fneg_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000 +; GFX11-NEXT: s_xor_b32 s2, s4, 0x80008000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -296,7 +298,7 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) # define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 { ; CIVI-LABEL: s_fneg_v2f16_nonload: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CIVI-NEXT: ;;#ASMSTART ; CIVI-NEXT: ; def s2 ; CIVI-NEXT: ;;#ASMEND @@ -310,7 +312,7 @@ define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 { ; ; GFX9-LABEL: s_fneg_v2f16_nonload: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s2 ; GFX9-NEXT: ;;#ASMEND @@ -323,7 +325,7 @@ define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 { ; ; GFX11-LABEL: s_fneg_v2f16_nonload: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s2 ; GFX11-NEXT: ;;#ASMEND @@ -345,7 +347,7 @@ define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 { define amdgpu_kernel void 
@v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fneg_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -359,7 +361,7 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: v_fneg_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -373,7 +375,7 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_fneg_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -384,7 +386,9 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_fneg_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -406,8 +410,8 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; CI-LABEL: fneg_free_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -418,8 +422,8 @@ 
define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; ; GFX8-LABEL: fneg_free_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -430,8 +434,8 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; ; GFX9-LABEL: fneg_free_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_xor_b32 s2, s2, 0x80008000 @@ -442,10 +446,10 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; GFX11-LABEL: fneg_free_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000 +; GFX11-NEXT: s_xor_b32 s2, s4, 0x80008000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -461,7 +465,7 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fneg_fold_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -487,7 +491,7 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr 
addrspace(1) %out, ptr addrspac ; ; GFX8-LABEL: v_fneg_fold_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -503,7 +507,7 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_fneg_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -514,7 +518,7 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX11-LABEL: v_fneg_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -534,7 +538,7 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { ; CI-LABEL: v_extract_fneg_fold_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -555,7 +559,7 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; GFX8-LABEL: v_extract_fneg_fold_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -572,7 +576,7 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; GFX9-LABEL: v_extract_fneg_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -588,7 +592,7 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; GFX11-LABEL: v_extract_fneg_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] @@ -619,7 +623,7 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0 { ; CIVI-LABEL: v_extract_fneg_no_fold_v2f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 @@ -635,7 +639,7 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0 ; ; GFX9-LABEL: v_extract_fneg_no_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] @@ -649,7 +653,7 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0 ; ; GFX11-LABEL: v_extract_fneg_no_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll index d78bdfe08772a..e447429539e6f 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.ll @@ -7,8 +7,8 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) 
%out, float %in) { ; SI-LABEL: s_fneg_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -19,10 +19,10 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) { ; ; VI-LABEL: s_fneg_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 +; VI-NEXT: s_xor_b32 s2, s4, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -32,10 +32,10 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) { ; GFX11-LABEL: s_fneg_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_xor_b32 s2, s4, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -50,7 +50,7 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x float> %in) { ; SI-LABEL: s_fneg_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -65,7 +65,7 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl ; ; VI-LABEL: 
s_fneg_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b32 s3, s3, 0x80000000 ; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 @@ -78,7 +78,7 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl ; ; GFX11-LABEL: s_fneg_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000 @@ -97,8 +97,8 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x float> %in) { ; SI-LABEL: s_fneg_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -115,8 +115,8 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl ; ; VI-LABEL: s_fneg_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b32 s2, s7, 0x80000000 ; VI-NEXT: s_xor_b32 s3, s6, 0x80000000 @@ -134,8 +134,8 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl ; GFX11-LABEL: s_fneg_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s2, s7, 
0x80000000 ; GFX11-NEXT: s_xor_b32 s3, s6, 0x80000000 @@ -157,8 +157,8 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: fsub0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -168,10 +168,10 @@ define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: fsub0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_sub_f32_e64 v2, 0, s2 +; VI-NEXT: v_sub_f32_e64 v2, 0, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -180,11 +180,11 @@ define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) { ; GFX11-LABEL: fsub0_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_sub_f32_e64 v1, 0, s2 +; GFX11-NEXT: v_sub_f32_e64 v1, 0, s4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -198,8 +198,8 @@ define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: fneg_free_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb 
+; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -210,10 +210,10 @@ define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: fneg_free_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 +; VI-NEXT: s_xor_b32 s2, s4, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -223,10 +223,10 @@ define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) { ; GFX11-LABEL: fneg_free_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_xor_b32 s2, s4, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -242,8 +242,8 @@ define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: fneg_fold_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -253,10 +253,10 @@ define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { ; ; VI-LABEL: fneg_fold_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_f32_e64 v2, -s2, s2 +; VI-NEXT: v_mul_f32_e64 v2, -s4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -265,11 +265,11 @@ define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { ; GFX11-LABEL: fneg_fold_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f32_e64 v1, -s2, s2 +; GFX11-NEXT: v_mul_f32_e64 v1, -s4, s4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -284,8 +284,8 @@ define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: bitpreserve_fneg_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -295,10 +295,10 @@ define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in ; ; VI-LABEL: bitpreserve_fneg_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_f32_e64 v2, s2, -4.0 +; VI-NEXT: v_mul_f32_e64 v2, s4, -4.0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ 
-307,11 +307,11 @@ define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in ; GFX11-LABEL: bitpreserve_fneg_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f32_e64 v1, s2, -4.0 +; GFX11-NEXT: v_mul_f32_e64 v1, s4, -4.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -327,8 +327,8 @@ define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_fneg_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -339,10 +339,10 @@ define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: s_fneg_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 +; VI-NEXT: s_xor_b32 s2, s4, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -352,10 +352,10 @@ define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) { ; GFX11-LABEL: s_fneg_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_xor_b32 s2, s4, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -380,8 +380,8 @@ define i32 @v_fneg_i32(i32 %in) { define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_fneg_i32_fp_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -391,10 +391,10 @@ define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: s_fneg_i32_fp_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_sub_f32_e64 v2, 2.0, s2 +; VI-NEXT: v_sub_f32_e64 v2, 2.0, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -403,11 +403,11 @@ define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) { ; GFX11-LABEL: s_fneg_i32_fp_use: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_sub_f32_e64 v1, 2.0, s2 +; GFX11-NEXT: v_sub_f32_e64 v1, 2.0, s4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -434,7 +434,7 @@ define float @v_fneg_i32_fp_use(i32 %in) { define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) 
%out, i64 %in) { ; SI-LABEL: s_fneg_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -448,7 +448,7 @@ define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: s_fneg_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_xor_b32 s0, s3, 0x80000000 @@ -460,7 +460,7 @@ define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX11-LABEL: s_fneg_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -488,7 +488,7 @@ define i64 @v_fneg_i64(i64 %in) { define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: s_fneg_i64_fp_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -500,7 +500,7 @@ define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: s_fneg_i64_fp_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -510,7 +510,7 @@ define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) { ; ; GFX11-LABEL: s_fneg_i64_fp_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f64 v[0:1], 
-s[2:3], 2.0 @@ -550,23 +550,24 @@ define i16 @v_fneg_i16(i16 %in) { define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) { ; SI-LABEL: s_fneg_i16_fp_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fneg_i16_fp_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_sub_f16_e64 v2, 2.0, s2 +; VI-NEXT: v_sub_f16_e64 v2, 2.0, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -575,11 +576,11 @@ define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) { ; GFX11-LABEL: s_fneg_i16_fp_use: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_sub_f16_e64 v1, 2.0, s2 +; GFX11-NEXT: v_sub_f16_e64 v1, 2.0, s4 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -619,8 +620,8 @@ define half @v_fneg_i16_fp_use(i16 %in) { define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) { ; SI-LABEL: s_fneg_v2i16: ; SI: ; %bb.0: -; 
SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -631,15 +632,15 @@ define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) { ; ; VI-LABEL: s_fneg_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s3, s2, 16 +; VI-NEXT: s_lshr_b32 s2, s4, 16 +; VI-NEXT: s_xor_b32 s3, s4, 0x8000 ; VI-NEXT: s_xor_b32 s2, s2, 0x8000 -; VI-NEXT: s_xor_b32 s3, s3, 0x8000 -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_lshl_b32 s3, s3, 16 -; VI-NEXT: s_or_b32 s2, s2, s3 +; VI-NEXT: s_and_b32 s3, s3, 0xffff +; VI-NEXT: s_lshl_b32 s2, s2, 16 +; VI-NEXT: s_or_b32 s2, s3, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -649,10 +650,10 @@ define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) { ; GFX11-LABEL: s_fneg_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000 +; GFX11-NEXT: s_xor_b32 s2, s4, 0x80008000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -695,34 +696,35 @@ define <2 x i16> @v_fneg_v2i16(<2 x i16> %in) { define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg) { ; SI-LABEL: s_fneg_v2i16_fp_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s3, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; SI-NEXT: s_lshr_b32 s1, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_sub_f32_e32 v1, 2.0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fneg_v2i16_fp_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x4000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s3, s2, 16 -; VI-NEXT: s_xor_b32 s3, s3, 0x8000 +; VI-NEXT: s_lshr_b32 s2, s4, 16 ; VI-NEXT: s_xor_b32 s2, s2, 0x8000 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_add_f16_e64 v1, s2, 2.0 +; VI-NEXT: s_xor_b32 s3, s4, 0x8000 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_add_f16_e64 v1, s3, 2.0 ; VI-NEXT: v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -733,11 +735,11 @@ define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg) ; GFX11-LABEL: s_fneg_v2i16_fp_use: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-NEXT: v_pk_add_f16 v1, s2, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NEXT: v_pk_add_f16 v1, s4, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll index 37a201e390f81..65046681ffc20 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll @@ -8,13 +8,123 @@ declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i3 declare <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32, i32, i32) declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) +declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32, i32, i1) +declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) +define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { +; GFX12-SDAG-LABEL: local_atomic_fadd_v2f16_noret: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: ds_pk_add_f16 v0, v1 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_noret: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; 
GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE +; GFX12-GISEL-NEXT: s_endpgm + %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) + ret void +} + +define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { +; GFX12-SDAG-LABEL: local_atomic_fadd_v2bf16_noret: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: ds_pk_add_bf16 v0, v1 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_noret: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE +; GFX12-GISEL-NEXT: s_endpgm + %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) + ret void +} + +define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> %data) { +; GFX12-SDAG-LABEL: local_atomic_fadd_v2f16_rtn: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_rtn: +; GFX12-GISEL: ; 
%bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) + ret <2 x half> %ret +} + +define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> %data) { +; GFX12-SDAG-LABEL: local_atomic_fadd_v2bf16_rtn: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_rtn: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) + ret <2 x i16> %ret +} + define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) { ; GFX12-SDAG-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: 
s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 @@ -23,7 +133,7 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %da ; ; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 @@ -62,7 +172,7 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) { ; GFX12-SDAG-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 @@ -71,7 +181,7 @@ define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %da ; ; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 @@ -110,7 +220,7 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) { define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) { ; GFX12-SDAG-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: 
v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1] @@ -120,7 +230,7 @@ define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr ; ; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll index 0746b93546124..cdfc8f48349f6 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll @@ -14,17 +14,17 @@ declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> % define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 ; GFX940-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_fadd_f32_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 @@ -37,7 +37,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX940: ; %bb.0: -; GFX940-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -49,7 +49,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; ; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_mov_b32 s0, 0 @@ -76,7 +76,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -88,7 +88,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; ; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_mov_b32 s0, 0 @@ -180,17 +180,17 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) { ; GFX940-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: 
v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 ; GFX940-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 @@ -225,17 +225,17 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) { ; GFX940-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 ; GFX940-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 @@ -270,17 +270,17 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) { define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) { ; GFX940-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v1, s[2:3] +; GFX940-NEXT: 
global_atomic_pk_add_bf16 v0, v1, s[0:1] ; GFX940-NEXT: s_endpgm ; ; GFX12-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1] @@ -316,7 +316,7 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { ; GFX940-LABEL: local_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NEXT: v_mov_b32_e32 v1, s1 @@ -326,7 +326,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, ; ; GFX12-LABEL: local_atomic_fadd_v2f16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: ds_pk_add_f16 v0, v1 @@ -364,7 +364,7 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { ; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NEXT: v_mov_b32_e32 v1, s1 @@ -374,7 +374,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, ; ; GFX12-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: ds_pk_add_bf16 v0, v1 @@ -409,298 +409,4 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> ret <2 x i16> %ret } -define float @flat_atomic_fadd_f32_intrinsic_ret__posoffset(ptr %ptr, float %data) { -; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_ret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:4092 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_ret__posoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:4092 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 1023 - %result = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) - ret float %result -} - -define float @flat_atomic_fadd_f32_intrinsic_ret__negoffset(ptr %ptr, float %data) { -; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_ret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_ret__negoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 
offset:-1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -256 - %result = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) - ret float %result -} - -define void @flat_atomic_fadd_f32_intrinsic_noret__posoffset(ptr %ptr, float %data) { -; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_noret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:4092 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_noret__posoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:4092 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 1023 - %unused = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) - ret void -} - -define void @flat_atomic_fadd_f32_intrinsic_noret__negoffset(ptr %ptr, float %data) { -; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_noret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_noret__negoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-1024 -; 
GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -256 - %unused = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) - ret void -} - -define <2 x half> @flat_atomic_fadd_v2f16_intrinsic_ret__posoffset(ptr %ptr, <2 x half> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:4092 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__posoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:4092 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 1023 - %result = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) - ret <2 x half> %result -} - -define <2 x half> @flat_atomic_fadd_v2f16_intrinsic_ret__negoffset(ptr %ptr, <2 x half> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__negoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; 
GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:-1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 -256 - %result = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) - ret <2 x half> %result -} - -define void @flat_atomic_fadd_v2f16_intrinsic_noret__posoffset(ptr %ptr, <2 x half> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:4092 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__posoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:4092 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 1023 - %unused = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) - ret void -} - -define void @flat_atomic_fadd_v2f16_intrinsic_noret__negoffset(ptr %ptr, <2 x half> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__negoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; 
GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:-1024 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 -256 - %unused = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) - ret void -} - -define <2 x i16> @flat_atomic_fadd_v2bf16_intrinsic_ret__posoffset(ptr %ptr, <2 x i16> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:4092 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__posoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:4092 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x i16>, ptr %ptr, i64 1023 - %result = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) - ret <2 x i16> %result -} - -define <2 x i16> @flat_atomic_fadd_v2bf16_intrinsic_ret__negoffset(ptr %ptr, <2 x i16> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__negoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:-1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x i16>, ptr %ptr, i64 -256 - %result = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) - ret <2 x i16> %result -} - -define void @flat_atomic_fadd_v2bf16_intrinsic_noret__posoffset(ptr %ptr, <2 x i16> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:4092 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__posoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:4092 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x i16>, ptr %ptr, i64 1023 - %unused = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) - ret void -} - -define void @flat_atomic_fadd_v2bf16_intrinsic_noret__negoffset(ptr %ptr, <2 x i16> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: 
flat_atomic_fadd_v2bf16_intrinsic_noret__negoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:-1024 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x i16>, ptr %ptr, i64 -256 - %unused = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) - ret void -} - attributes #0 = { "denormal-fp-math-f32"="ieee,ieee" } diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll index 18d2e52e8f900..fb731cc00d3f0 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll @@ -9,24 +9,24 @@ declare double @llvm.fabs.f64(double) #1 define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isinf_pattern: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x204 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isinf_pattern: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x204 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 +; VI-NEXT: 
v_cmp_class_f32_e32 vcc, s4, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -36,11 +36,11 @@ define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, f ; GFX11-LABEL: test_isinf_pattern: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x204 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x204 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -57,24 +57,24 @@ define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, f define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_not_isinf_pattern_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_nlg_f32_e64 s[0:1], |s0|, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_cmp_nlg_f32_e64 s[4:5], |s4|, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_not_isinf_pattern_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 
v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_nlg_f32_e64 s[2:3], |s2|, v0 +; VI-NEXT: v_cmp_nlg_f32_e64 s[2:3], |s4|, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -84,11 +84,11 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture % ; GFX11-LABEL: test_not_isinf_pattern_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x7f800000, |s2| +; GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x7f800000, |s4| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -105,7 +105,7 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture % define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_not_isinf_pattern_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -115,7 +115,7 @@ define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture % ; ; VI-LABEL: test_not_isinf_pattern_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -125,7 +125,7 @@ define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture % ; ; GFX11-LABEL: test_not_isinf_pattern_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -142,24 +142,24 @@ define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture % define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_pattern_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -169,11 +169,11 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o ; GFX11-LABEL: test_isfinite_pattern_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 
0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -192,24 +192,24 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_pattern_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -219,11 +219,11 @@ define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %o ; GFX11-LABEL: test_isfinite_pattern_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 +; 
GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -241,8 +241,8 @@ define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %o define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_not_pattern_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -253,10 +253,10 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur ; ; VI-LABEL: test_isfinite_not_pattern_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s2, s2 +; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -266,11 +266,11 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_not_pattern_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2 +; GFX11-NEXT: v_cmp_o_f32_e64 s2, s4, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -290,23 +290,23 @@ define amdgpu_kernel void 
@test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_not_pattern_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_o_f32_e64 s[0:1], s2, s2 -; SI-NEXT: v_cmp_neq_f32_e32 vcc, s2, v0 -; SI-NEXT: s_and_b64 s[0:1], s[0:1], vcc -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_cmp_o_f32_e64 s[4:5], s6, s6 +; SI-NEXT: v_cmp_neq_f32_e32 vcc, s6, v0 +; SI-NEXT: s_and_b64 s[4:5], s[4:5], vcc +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_not_pattern_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s4, s4 @@ -321,14 +321,14 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_not_pattern_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s2 -; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, s2 +; GFX11-NEXT: v_cmp_o_f32_e64 s2, s4, s4 +; GFX11-NEXT: 
v_cmp_neq_f32_e64 s3, 0x7f800000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s2, s3, s2 +; GFX11-NEXT: s_and_b32 s2, s2, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -346,7 +346,7 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocapture %out, float %x, float %y) #0 { ; SI-LABEL: test_isfinite_not_pattern_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 @@ -362,7 +362,7 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur ; ; VI-LABEL: test_isfinite_not_pattern_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_o_f32_e64 s[4:5], s2, s2 @@ -376,7 +376,7 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur ; ; GFX11-LABEL: test_isfinite_not_pattern_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2 @@ -401,23 +401,23 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_not_pattern_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword 
s6, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_u_f32_e64 s[0:1], s2, s2 -; SI-NEXT: v_cmp_neq_f32_e64 s[2:3], |s2|, v0 -; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_cmp_u_f32_e64 s[4:5], s6, s6 +; SI-NEXT: v_cmp_neq_f32_e64 s[6:7], |s6|, v0 +; SI-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_not_pattern_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_u_f32_e64 s[2:3], s4, s4 @@ -432,14 +432,14 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_not_pattern_3: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_u_f32_e64 s3, s2, s2 -; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, |s2| +; GFX11-NEXT: v_cmp_u_f32_e64 s2, s4, s4 +; GFX11-NEXT: v_cmp_neq_f32_e64 s3, 0x7f800000, |s4| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s2, s3, s2 +; GFX11-NEXT: s_and_b32 s2, s2, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -457,24 +457,24 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur 
define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_pattern_4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -484,11 +484,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o ; GFX11-LABEL: test_isfinite_pattern_4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -507,24 +507,24 @@ define amdgpu_kernel void 
@test_isfinite_pattern_4(ptr addrspace(1) nocapture %o define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_pattern_4_commute_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_4_commute_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -534,11 +534,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1) ; GFX11-LABEL: test_isfinite_pattern_4_commute_and: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; 
GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -557,16 +557,16 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1) define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrspace(1) nocapture %out, float %x, [8 x i32], float %y) #0 { ; SI-LABEL: test_not_isfinite_pattern_4_wrong_ord_test: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0x14 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_load_dword s0, s[2:3], 0x14 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dword s1, s[2:3], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: v_cmp_o_f32_e32 vcc, s0, v1 -; SI-NEXT: v_cmp_class_f32_e64 s[0:1], s0, v0 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: v_cmp_o_f32_e32 vcc, s1, v1 +; SI-NEXT: v_cmp_class_f32_e64 s[0:1], s1, v0 ; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -574,14 +574,14 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp ; ; VI-LABEL: test_not_isfinite_pattern_4_wrong_ord_test: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x50 -; VI-NEXT: s_load_dword s5, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x50 +; VI-NEXT: s_load_dword s1, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_cmp_class_f32_e32 vcc, s5, v0 -; VI-NEXT: v_cmp_o_f32_e64 s[0:1], s5, v1 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s1, v0 +; VI-NEXT: v_cmp_o_f32_e64 s[0:1], s1, v1 ; VI-NEXT: s_and_b64 s[0:1], s[0:1], vcc ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] @@ -592,15 
+592,15 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp ; GFX11-LABEL: test_not_isfinite_pattern_4_wrong_ord_test: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x50 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x50 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s3 -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 +; GFX11-NEXT: v_cmp_class_f32_e64 s3, s4, 0x1f8 +; GFX11-NEXT: v_cmp_o_f32_e64 s2, s4, s5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s2, s3, s2 +; GFX11-NEXT: s_and_b32 s2, s2, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -618,8 +618,8 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isinf_pattern_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -632,11 +632,11 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou ; ; VI-LABEL: test_isinf_pattern_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x204 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 +; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0 ; 
VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -646,11 +646,11 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou ; GFX11-LABEL: test_isinf_pattern_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x204 +; GFX11-NEXT: v_cmp_class_f16_e64 s2, s4, 0x204 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -667,8 +667,8 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isfinite_pattern_0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -684,11 +684,11 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur ; ; VI-LABEL: test_isfinite_pattern_0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 +; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -698,11 +698,11 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) 
nocaptur ; GFX11-LABEL: test_isfinite_pattern_0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8 +; GFX11-NEXT: v_cmp_class_f16_e64 s2, s4, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -721,8 +721,8 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isfinite_pattern_4_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -738,11 +738,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur ; ; VI-LABEL: test_isfinite_pattern_4_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 +; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -752,11 +752,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_pattern_4_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; 
GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8 +; GFX11-NEXT: v_cmp_class_f16_e64 s2, s4, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll index 587340c7aa342..105d9246880a4 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll @@ -20,8 +20,8 @@ declare float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float, <4 x i32>, i32, i32 define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_min_noret_f32: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -30,8 +30,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; ; GFX7-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -41,19 +41,19 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; GFX10-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -63,8 +63,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; GFX1100-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX1100: ; %bb.0: ; %main_body ; GFX1100-NEXT: s_clause 0x1 -; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen @@ -75,8 +75,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; GFX12-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen @@ -86,8 +86,8 @@ define amdgpu_kernel 
void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; ; G_SI-LABEL: raw_buffer_atomic_min_noret_f32: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -96,8 +96,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; ; G_GFX7-LABEL: raw_buffer_atomic_min_noret_f32: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -107,19 +107,19 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; G_GFX10-LABEL: raw_buffer_atomic_min_noret_f32: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_buffer_atomic_min_noret_f32: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt 
lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -129,8 +129,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; G_GFX1100-LABEL: raw_buffer_atomic_min_noret_f32: ; G_GFX1100: ; %bb.0: ; %main_body ; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; G_GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen @@ -242,14 +242,15 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex, ptr addrspace(3) %out) { ; SI-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; SI-NEXT: s_load_dword s0, s[2:3], 0xf +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: ds_write_b32 v1, v0 @@ -257,60 +258,48 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX7-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0xf +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; 
GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc +; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v1, v0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x3c -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc +; GFX10-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_write_b32 v1, v0 ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1030-NEXT: s_load_dword s0, s[0:1], 0x3c +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s2 -; GFX1030-NEXT: v_mov_b32_e32 v1, s3 -; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; GFX1030-NEXT: v_mov_b32_e32 v1, s0 +; GFX1030-NEXT: v_mov_b32_e32 v0, s4 +; GFX1030-NEXT: v_mov_b32_e32 v1, s5 +; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc +; GFX1030-NEXT: 
v_mov_b32_e32 v1, s6 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: ds_write_b32 v1, v0 ; GFX1030-NEXT: s_endpgm ; ; GFX1100-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX1100: ; %bb.0: ; %main_body -; GFX1100-NEXT: s_clause 0x2 -; GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 -; GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1100-NEXT: s_load_b32 s0, s[0:1], 0x3c +; GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 4 offen glc slc -; GFX1100-NEXT: v_mov_b32_e32 v1, s0 +; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 4 offen glc slc +; GFX1100-NEXT: v_mov_b32_e32 v1, s6 ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: ds_store_b32 v1, v0 ; GFX1100-NEXT: s_endpgm @@ -318,8 +307,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; GFX12-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[4:6], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_mov_b32 s4, 4 @@ -331,14 +320,15 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; G_SI-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; G_SI-NEXT: s_load_dword s0, s[0:1], 0xf +; G_SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; G_SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s2 -; G_SI-NEXT: v_mov_b32_e32 v1, s3 
+; G_SI-NEXT: v_mov_b32_e32 v0, s0 +; G_SI-NEXT: v_mov_b32_e32 v1, s1 ; G_SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; G_SI-NEXT: s_load_dword s0, s[2:3], 0xf +; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v1, s0 ; G_SI-NEXT: s_waitcnt vmcnt(0) ; G_SI-NEXT: ds_write_b32 v1, v0 @@ -346,14 +336,15 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX7-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; G_GFX7-NEXT: s_load_dword s0, s[0:1], 0xf +; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; G_GFX7-NEXT: s_load_dword s0, s[2:3], 0xf +; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX7-NEXT: s_waitcnt vmcnt(0) ; G_GFX7-NEXT: ds_write_b32 v1, v0 @@ -362,12 +353,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; G_GFX10-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 -; G_GFX10-NEXT: s_load_dword s0, s[0:1], 0x3c +; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: s_load_dword s0, s[2:3], 0x3c ; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen 
glc slc ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v1, s0 @@ -377,14 +368,15 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX1030-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; G_GFX1030-NEXT: s_load_dword s0, s[0:1], 0x3c +; G_GFX1030-NEXT: s_clause 0x1 +; G_GFX1030-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX1030-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX1030-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX1030-NEXT: s_load_dword s0, s[2:3], 0x3c ; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX1030-NEXT: s_waitcnt vmcnt(0) ; G_GFX1030-NEXT: ds_write_b32 v1, v0 @@ -392,13 +384,14 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX1100-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX1100: ; %bb.0: ; %main_body -; G_GFX1100-NEXT: s_clause 0x2 -; G_GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; G_GFX1100-NEXT: s_load_b32 s0, s[0:1], 0x3c +; G_GFX1100-NEXT: s_clause 0x1 +; G_GFX1100-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; G_GFX1100-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; G_GFX1100-NEXT: s_load_b32 s0, s[2:3], 0x3c ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 4 offen glc slc +; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX1100-NEXT: s_waitcnt 
vmcnt(0) ; G_GFX1100-NEXT: ds_store_b32 v1, v0 @@ -412,8 +405,8 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_max_noret_f32: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -422,8 +415,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; ; GFX7-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -433,19 +426,19 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; GFX10-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -455,8 +448,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; GFX1100-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX1100: ; %bb.0: ; %main_body ; GFX1100-NEXT: s_clause 0x1 -; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen @@ -467,8 +460,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; GFX12-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen @@ -478,8 +471,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; ; G_SI-LABEL: raw_buffer_atomic_max_noret_f32: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -488,8 +481,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; ; G_GFX7-LABEL: raw_buffer_atomic_max_noret_f32: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_GFX7-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -499,19 +492,19 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; G_GFX10-LABEL: raw_buffer_atomic_max_noret_f32: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_buffer_atomic_max_noret_f32: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -521,8 +514,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; G_GFX1100-LABEL: raw_buffer_atomic_max_noret_f32: ; G_GFX1100: ; %bb.0: ; %main_body ; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; G_GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen @@ 
-634,7 +627,7 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex, ptr addrspace(1) %out) { ; SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -649,7 +642,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX7-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -664,7 +657,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 @@ -676,7 +669,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -688,7 +681,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX1100-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1100: ; %bb.0: ; %main_body -; GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 
:: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc @@ -701,7 +694,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX12-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_mov_b32 s4, 4 @@ -715,7 +708,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; G_SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -729,7 +722,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -743,7 +736,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX10-NEXT: v_mov_b32_e32 v1, s5 @@ -755,7 +748,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], 
s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -767,7 +760,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX1100-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1100: ; %bb.0: ; %main_body -; G_GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; G_GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll index e3ed0fa491884..e124aadf4e8c2 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll @@ -18,8 +18,8 @@ declare float @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f32(float, ptr addrspace(8 define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -28,8 +28,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; ; GFX7-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -39,19 +39,19 @@ define 
amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; GFX10-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -61,8 +61,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; GFX1100-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; GFX1100: ; %bb.0: ; %main_body ; GFX1100-NEXT: s_clause 0x1 -; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen @@ -72,8 +72,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; ; G_SI-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; 
G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -82,8 +82,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -93,19 +93,19 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; G_GFX10-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -115,8 +115,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; G_GFX1100-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_GFX1100: ; %bb.0: ; %main_body ; 
G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; G_GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen @@ -219,14 +219,15 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex, ptr addrspace(3) %out) { ; SI-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; SI-NEXT: s_load_dword s0, s[2:3], 0xf +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: ds_write_b32 v1, v0 @@ -234,74 +235,63 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; ; GFX7-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0xf +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: 
v_mov_b32_e32 v1, s5 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc +; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v1, v0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x3c -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc +; GFX10-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_write_b32 v1, v0 ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1030-NEXT: s_load_dword s0, s[0:1], 0x3c +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s2 -; GFX1030-NEXT: v_mov_b32_e32 v1, s3 -; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; GFX1030-NEXT: v_mov_b32_e32 v1, s0 +; GFX1030-NEXT: v_mov_b32_e32 v0, s4 +; GFX1030-NEXT: v_mov_b32_e32 v1, s5 +; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc +; GFX1030-NEXT: v_mov_b32_e32 v1, s6 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: ds_write_b32 v1, v0 ; GFX1030-NEXT: s_endpgm ; ; GFX1100-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; GFX1100: ; %bb.0: ; %main_body -; GFX1100-NEXT: s_clause 0x2 -; 
GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 -; GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1100-NEXT: s_load_b32 s0, s[0:1], 0x3c +; GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 4 offen glc slc -; GFX1100-NEXT: v_mov_b32_e32 v1, s0 +; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 4 offen glc slc +; GFX1100-NEXT: v_mov_b32_e32 v1, s6 ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: ds_store_b32 v1, v0 ; GFX1100-NEXT: s_endpgm ; ; G_SI-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; G_SI-NEXT: s_load_dword s0, s[0:1], 0xf +; G_SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; G_SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s2 -; G_SI-NEXT: v_mov_b32_e32 v1, s3 +; G_SI-NEXT: v_mov_b32_e32 v0, s0 +; G_SI-NEXT: v_mov_b32_e32 v1, s1 ; G_SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; G_SI-NEXT: s_load_dword s0, s[2:3], 0xf +; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v1, s0 ; G_SI-NEXT: s_waitcnt vmcnt(0) ; G_SI-NEXT: ds_write_b32 v1, v0 @@ -309,14 +299,15 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; G_GFX7-NEXT: s_load_dword s0, s[0:1], 0xf +; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s2 -; 
G_GFX7-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; G_GFX7-NEXT: s_load_dword s0, s[2:3], 0xf +; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX7-NEXT: s_waitcnt vmcnt(0) ; G_GFX7-NEXT: ds_write_b32 v1, v0 @@ -325,12 +316,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; G_GFX10-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 -; G_GFX10-NEXT: s_load_dword s0, s[0:1], 0x3c +; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: s_load_dword s0, s[2:3], 0x3c ; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v1, s0 @@ -340,14 +331,15 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; G_GFX1030-NEXT: s_load_dword s0, s[0:1], 0x3c +; G_GFX1030-NEXT: s_clause 0x1 +; G_GFX1030-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX1030-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX1030-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX1030-NEXT: s_load_dword s0, 
s[2:3], 0x3c ; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX1030-NEXT: s_waitcnt vmcnt(0) ; G_GFX1030-NEXT: ds_write_b32 v1, v0 @@ -355,13 +347,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX1100-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX1100: ; %bb.0: ; %main_body -; G_GFX1100-NEXT: s_clause 0x2 -; G_GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; G_GFX1100-NEXT: s_load_b32 s0, s[0:1], 0x3c +; G_GFX1100-NEXT: s_clause 0x1 +; G_GFX1100-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; G_GFX1100-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; G_GFX1100-NEXT: s_load_b32 s0, s[2:3], 0x3c ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 4 offen glc slc +; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX1100-NEXT: s_waitcnt vmcnt(0) ; G_GFX1100-NEXT: ds_store_b32 v1, v0 @@ -376,8 +369,8 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -386,8 +379,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; ; GFX7-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; 
GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -397,19 +390,19 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; GFX10-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -419,8 +412,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; GFX1100-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; GFX1100: ; %bb.0: ; %main_body ; GFX1100-NEXT: s_clause 0x1 -; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen @@ -430,8 +423,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; ; 
G_SI-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -440,8 +433,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -451,19 +444,19 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, 
s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -473,8 +466,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; G_GFX1100-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_GFX1100: ; %bb.0: ; %main_body ; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; G_GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen @@ -577,7 +570,7 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex, ptr addrspace(1) %out) { ; SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -592,7 +585,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; GFX7-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -607,7 +600,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 @@ -619,7 +612,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; 
GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -631,7 +624,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; GFX1100-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1100: ; %bb.0: ; %main_body -; GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc @@ -644,7 +637,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; G_SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -658,7 +651,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -672,7 +665,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX10-NEXT: v_mov_b32_e32 v1, s5 @@ -684,7 
+677,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -696,7 +689,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX1100-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1100: ; %bb.0: ; %main_body -; G_GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; G_GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll index d827ea0503a3b..81859dce04889 100644 --- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll @@ -10,7 +10,7 @@ declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; GFX6-LABEL: test_convert_fp16_to_fp32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -28,7 +28,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o ; ; GFX8-LABEL: test_convert_fp16_to_fp32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -46,7 +46,7 @@ define amdgpu_kernel void 
@test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: test_convert_fp16_to_fp32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll index 03b8251ea4640..c17be87834aeb 100644 --- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll @@ -8,7 +8,7 @@ declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; GFX6-LABEL: test_convert_fp16_to_fp64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -27,7 +27,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o ; ; GFX8-LABEL: test_convert_fp16_to_fp64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -46,7 +46,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: test_convert_fp16_to_fp64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll index 8ab82b722445e..d8a726f251a01 100644 --- a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll +++ b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll @@ -9,7 +9,7 @@ declare i16 
@llvm.convert.to.fp16.f32(float) nounwind readnone define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; GFX6-LABEL: test_convert_fp32_to_fp16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -27,7 +27,7 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o ; ; GFX8-LABEL: test_convert_fp32_to_fp16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -45,7 +45,7 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: test_convert_fp32_to_fp16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 5690b99e43ece..ce1fcccf4a17c 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -26,22 +26,22 @@ declare double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) nocapture, double, i32, define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; 
GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -73,12 +73,12 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -88,12 +88,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr ; ; 
GFX940-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -109,22 +109,22 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -156,12 +156,12 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -171,12 +171,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: 
buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -192,22 +192,22 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -239,12 +239,12 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], 
s[0:1], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -254,12 +254,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> % ; ; GFX940-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -275,22 +275,22 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_ptr_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -322,12 +322,12 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -337,12 +337,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add 
; ; GFX940-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -358,22 +358,22 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -405,12 +405,12 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -420,12 +420,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX940-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_min_f64 
v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -441,22 +441,22 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -488,12 +488,12 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], 
s[0:1], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -503,12 +503,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp ; ; GFX940-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -524,22 +524,22 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 
s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -571,12 +571,12 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -586,12 +586,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> % ; ; GFX940-LABEL: 
struct_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -607,22 +607,22 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_ptr_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -654,12 +654,12 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -669,12 +669,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add ; ; GFX940-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 
v2, s10 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -690,22 +690,22 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -737,12 +737,12 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: 
s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -752,12 +752,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX940-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -773,22 +773,22 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -820,12 +820,12 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -835,12 +835,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; GFX940-LABEL: 
raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -856,22 +856,22 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -903,12 +903,12 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -918,12 +918,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> % ; ; GFX940-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: 
buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -939,22 +939,22 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_ptr_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -986,12 +986,12 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -1001,12 +1001,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add ; ; GFX940-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -1022,7 +1022,7 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fadd_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s2 @@ -1032,7 +1032,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, d ; ; 
GFX940-LABEL: global_atomic_fadd_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s2 @@ -1047,7 +1047,7 @@ main_body: define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fmin_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s2 @@ -1057,7 +1057,7 @@ define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, d ; ; GFX940-LABEL: global_atomic_fmin_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s2 @@ -1072,7 +1072,7 @@ main_body: define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fmax_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s2 @@ -1082,7 +1082,7 @@ define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, d ; ; GFX940-LABEL: global_atomic_fmax_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s2 @@ -1097,23 +1097,23 @@ main_body: define amdgpu_kernel void 
@global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[2:3], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB39_3 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s6 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s4 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB39_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -1132,21 +1132,21 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: s_mov_b64 s[0:1], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: 
s_cbranch_execz .LBB39_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: .LBB39_2: @@ -1159,20 +1159,20 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[2:3], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: s_mov_b64 s[0:1], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB40_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: .LBB40_2: @@ -1180,21 +1180,21 @@ define amdgpu_kernel void 
@global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: s_mov_b64 s[0:1], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB40_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: .LBB40_2: @@ -1207,23 +1207,23 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[2:3], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB41_3 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3] -; GFX90A-NEXT: v_cvt_f64_u32_e32 
v[0:1], s6 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s4 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB41_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -1242,21 +1242,21 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: s_mov_b64 s[0:1], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB41_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: .LBB41_2: @@ -1269,20 +1269,20 @@ 
main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[2:3], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: s_mov_b64 s[0:1], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB42_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: .LBB42_2: @@ -1290,21 +1290,21 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: s_mov_b64 s[0:1], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB42_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX940-NEXT: s_load_dwordx2 s[2:3], 
s[2:3], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: .LBB42_2: @@ -1479,23 +1479,23 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrspace(1) %ptr) { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[2:3], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB49_3 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s6 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s4 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB49_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -1512,21 +1512,21 
@@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: s_mov_b64 s[0:1], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB49_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: .LBB49_2: @@ -1539,7 +1539,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX90A-NEXT: s_mov_b64 s[0:1], 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] @@ -1564,7 +1564,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] @@ -1581,7 +1581,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -1593,7 +1593,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] @@ -1610,7 +1610,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_system: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX90A-NEXT: s_mov_b64 s[0:1], 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] @@ -1636,7 +1636,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_system: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] @@ -1760,7 +1760,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s1 @@ -1771,7 +1771,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NEXT: v_mov_b32_e32 v1, s1 @@ -1806,7 +1806,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX90A-NEXT: s_mov_b64 s[0:1], 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] @@ -1829,7 +1829,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] @@ -1846,7 +1846,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) { ; GFX90A-LABEL: flat_atomic_fmin_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s1 @@ -1857,7 +1857,7 @@ define amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) { ; ; GFX940-LABEL: flat_atomic_fmin_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NEXT: v_mov_b32_e32 v1, s1 @@ -1892,7 +1892,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) { ; GFX90A-LABEL: flat_atomic_fmax_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s1 @@ -1903,7 +1903,7 @@ define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) { ; ; GFX940-LABEL: flat_atomic_fmax_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NEXT: v_mov_b32_e32 v1, s1 @@ -1938,16 +1938,16 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, double %data) { ; GFX90A-LABEL: local_atomic_fadd_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[2:3], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: s_mov_b64 s[0:1], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB63_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX90A-NEXT: s_load_dword s6, s[0:1], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c +; GFX90A-NEXT: s_load_dword s6, s[2:3], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mul_f64 
v[0:1], s[4:5], v[0:1] @@ -1959,16 +1959,16 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, do ; ; GFX940-LABEL: local_atomic_fadd_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: s_mov_b64 s[0:1], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB63_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX940-NEXT: s_load_dword s6, s[0:1], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c +; GFX940-NEXT: s_load_dword s6, s[2:3], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mul_f64 v[0:1], s[4:5], v[0:1] @@ -2008,21 +2008,21 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_from_flat_intrinsic(ptr addrspace(3) %ptr, double %data) { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_from_flat_intrinsic: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_from_flat_intrinsic: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX940-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX940-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_endpgm main_body: @@ -2056,19 +2056,19 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr) #1 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[2:3], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: s_mov_b64 s[0:1], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB67_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s0 +; GFX90A-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB67_2: @@ -2076,19 +2076,19 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: s_mov_b64 s[0:1], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB67_2 ; GFX940-NEXT: 
; %bb.1: -; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s0 +; GFX940-NEXT: v_mov_b32_e32 v2, s2 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: .LBB67_2: @@ -2101,19 +2101,19 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3) %ptr) #0 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[2:3], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: s_mov_b64 s[0:1], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB68_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s0 +; GFX90A-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB68_2: @@ -2121,19 +2121,19 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, 
v0 +; GFX940-NEXT: s_mov_b64 s[0:1], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB68_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s0 +; GFX940-NEXT: v_mov_b32_e32 v2, s2 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: .LBB68_2: @@ -2146,19 +2146,19 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrspace(3) %ptr) #4 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[2:3], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: s_mov_b64 s[0:1], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB69_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s0 +; GFX90A-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB69_2: @@ -2166,19 +2166,19 @@ define 
amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: s_mov_b64 s[0:1], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB69_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s0 +; GFX940-NEXT: v_mov_b32_e32 v2, s2 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: .LBB69_2: @@ -2256,264 +2256,6 @@ main_body: ret double %ret } -define double @flat_atomic_fadd_f64_intrinsic_rtn__posoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: flat_atomic_fadd_f64_intrinsic_rtn__posoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fadd_f64_intrinsic_rtn__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 511 - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data) - ret double %ret -} - -define double 
@flat_atomic_fadd_f64_intrinsic_rtn__negoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: flat_atomic_fadd_f64_intrinsic_rtn__negoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fadd_f64_intrinsic_rtn__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 -511 - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %gep, double %data) - ret double %ret -} - -define void @flat_atomic_fadd_f64_intrinsic_noret__posoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: flat_atomic_fadd_f64_intrinsic_noret__posoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fadd_f64_intrinsic_noret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 511 - %unused = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data) - ret void -} - -define void @flat_atomic_fadd_f64_intrinsic_noret__negoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: 
flat_atomic_fadd_f64_intrinsic_noret__negoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fadd_f64_intrinsic_noret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 -511 - %unused = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %gep, double %data) - ret void -} - -define double @flat_atomic_fmin_f64_intrinsic_rtn__posoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: flat_atomic_fmin_f64_intrinsic_rtn__posoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fmin_f64_intrinsic_rtn__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 511 - %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data) - ret double %ret -} - -define double @flat_atomic_fmin_f64_intrinsic_rtn__negoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: flat_atomic_fmin_f64_intrinsic_rtn__negoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fmin_f64_intrinsic_rtn__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 -511 - %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %gep, double %data) - ret double %ret -} - -define void @flat_atomic_fmin_f64_intrinsic_noret__posoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: flat_atomic_fmin_f64_intrinsic_noret__posoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fmin_f64_intrinsic_noret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 511 - %unused = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data) - ret void -} - -define void @flat_atomic_fmin_f64_intrinsic_noret__negoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: flat_atomic_fmin_f64_intrinsic_noret__negoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: 
flat_atomic_min_f64 v[0:1], v[2:3] -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fmin_f64_intrinsic_noret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 -511 - %unused = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %gep, double %data) - ret void -} - -define double @flat_atomic_fmax_f64_intrinsic_rtn__posoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: flat_atomic_fmax_f64_intrinsic_rtn__posoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fmax_f64_intrinsic_rtn__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 511 - %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) - ret double %ret -} - -define double @flat_atomic_fmax_f64_intrinsic_rtn__negoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: flat_atomic_fmax_f64_intrinsic_rtn__negoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 
s[30:31] -; -; GFX940-LABEL: flat_atomic_fmax_f64_intrinsic_rtn__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 -511 - %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %gep, double %data) - ret double %ret -} - -define void @flat_atomic_fmax_f64_intrinsic_noret__posoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: flat_atomic_fmax_f64_intrinsic_noret__posoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fmax_f64_intrinsic_noret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 511 - %unused = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) - ret void -} - -define void @flat_atomic_fmax_f64_intrinsic_noret__negoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: flat_atomic_fmax_f64_intrinsic_noret__negoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fmax_f64_intrinsic_noret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 -511 - %unused = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %gep, double %data) - ret void -} - attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" } attributes #2 = { "denormal-fp-math"="ieee,ieee" "amdgpu-unsafe-fp-atomics"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll index d610091840b95..f18f5752269e0 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll @@ -16,9 +16,9 @@ declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_min_noret_f64: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xf -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -28,9 +28,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; ; GFX7-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_load_dword s6, s[0:1], 0xf -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd 
+; GFX7-NEXT: s_load_dword s6, s[2:3], 0xf +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -41,12 +41,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; GFX10-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm @@ -54,9 +54,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; GFX1030-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -66,9 +66,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; ; G_SI-LABEL: raw_buffer_atomic_min_noret_f64: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_SI-NEXT: s_load_dword s6, s[0:1], 0xf -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; 
G_SI-NEXT: s_load_dword s6, s[2:3], 0xf +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -78,9 +78,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; ; G_GFX7-LABEL: raw_buffer_atomic_min_noret_f64: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_GFX7-NEXT: s_load_dword s6, s[0:1], 0xf -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -91,12 +91,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; G_GFX10-LABEL: raw_buffer_atomic_min_noret_f64: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x2 -; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; G_GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; G_GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s8 ; G_GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm @@ -104,9 +104,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; G_GFX1030-LABEL: raw_buffer_atomic_min_noret_f64: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; 
G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -253,9 +253,9 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_max_noret_f64: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xf -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -265,9 +265,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; ; GFX7-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_load_dword s6, s[0:1], 0xf -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dword s6, s[2:3], 0xf +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -278,12 +278,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; GFX10-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; 
GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm @@ -291,9 +291,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; GFX1030-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -303,9 +303,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; ; G_SI-LABEL: raw_buffer_atomic_max_noret_f64: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_SI-NEXT: s_load_dword s6, s[0:1], 0xf -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_SI-NEXT: s_load_dword s6, s[2:3], 0xf +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -315,9 +315,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; ; G_GFX7-LABEL: raw_buffer_atomic_max_noret_f64: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_GFX7-NEXT: s_load_dword s6, s[0:1], 0xf -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 
v1, s5 @@ -328,12 +328,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; G_GFX10-LABEL: raw_buffer_atomic_max_noret_f64: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x2 -; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; G_GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; G_GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s8 ; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm @@ -341,9 +341,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; G_GFX1030-LABEL: raw_buffer_atomic_max_noret_f64: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -424,7 +424,7 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(3) %out) { ; SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -438,7 +438,7 @@ define amdgpu_kernel void 
@raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; GFX7-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -452,7 +452,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 @@ -465,7 +465,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -478,7 +478,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; G_SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; G_SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 @@ -492,7 +492,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -506,7 +506,7 @@ define amdgpu_kernel void 
@raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX10-NEXT: v_mov_b32_e32 v1, s5 @@ -519,7 +519,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll index 5f501fec24c2e..6a2a8c3ce595d 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll @@ -16,9 +16,9 @@ declare double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double, ptr addrspace define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xf -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -28,9 +28,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; ; GFX7-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; 
GFX7-NEXT: s_load_dword s6, s[0:1], 0xf -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dword s6, s[2:3], 0xf +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -41,12 +41,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; GFX10-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm @@ -54,9 +54,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -66,9 +66,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; ; G_SI-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd 
-; G_SI-NEXT: s_load_dword s6, s[0:1], 0xf -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_SI-NEXT: s_load_dword s6, s[2:3], 0xf +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -78,9 +78,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_GFX7-NEXT: s_load_dword s6, s[0:1], 0xf -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -91,12 +91,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; G_GFX10-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x2 -; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; G_GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; G_GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s8 ; G_GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm @@ -104,9 +104,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x2 -; 
G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -253,9 +253,9 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xf -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -265,9 +265,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; ; GFX7-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_load_dword s6, s[0:1], 0xf -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dword s6, s[2:3], 0xf +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -278,12 +278,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; GFX10-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x34 +; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm @@ -291,9 +291,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -303,9 +303,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; ; G_SI-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_SI-NEXT: s_load_dword s6, s[0:1], 0xf -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; G_SI-NEXT: s_load_dword s6, s[2:3], 0xf +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -315,9 +315,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_GFX7-NEXT: s_load_dword s6, s[0:1], 0xf -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; 
G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -328,12 +328,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x2 -; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; G_GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; G_GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s8 ; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm @@ -341,9 +341,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -424,7 +424,7 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(3) %out) { ; SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx8 
s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -438,7 +438,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; GFX7-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -452,7 +452,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 @@ -465,7 +465,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -478,7 +478,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; G_SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; G_SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 @@ -492,7 +492,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; 
G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -506,7 +506,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX10-NEXT: v_mov_b32_e32 v1, s5 @@ -519,7 +519,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll index 04ef30bd26aa5..3571f3545ad1a 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll @@ -8,8 +8,8 @@ declare float @llvm.fabs.f32(float) #1 define amdgpu_kernel void @fp_to_sint_i32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: fp_to_sint_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -19,12 +19,12 @@ define amdgpu_kernel void @fp_to_sint_i32(ptr addrspace(1) %out, float %in) { ; ; VI-LABEL: fp_to_sint_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: 
s_mov_b32 s3, 0xf000 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_i32_f32_e32 v0, s2 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cvt_i32_f32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -47,8 +47,8 @@ define amdgpu_kernel void @fp_to_sint_i32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @fp_to_sint_i32_fabs(ptr addrspace(1) %out, float %in) { ; SI-LABEL: fp_to_sint_i32_fabs: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -58,12 +58,12 @@ define amdgpu_kernel void @fp_to_sint_i32_fabs(ptr addrspace(1) %out, float %in) ; ; VI-LABEL: fp_to_sint_i32_fabs: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_i32_f32_e64 v0, |s2| ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cvt_i32_f32_e64 v0, |s4| ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -87,7 +87,7 @@ define amdgpu_kernel void @fp_to_sint_i32_fabs(ptr addrspace(1) %out, float %in) define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-LABEL: fp_to_sint_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -100,7 +100,7 @@ define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fp_to_sint_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -132,7 +132,7 @@ define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> % define amdgpu_kernel void @fp_to_sint_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: fp_to_sint_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -147,7 +147,7 @@ define amdgpu_kernel void @fp_to_sint_v4i32(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: fp_to_sint_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -193,37 +193,37 @@ define amdgpu_kernel void @fp_to_sint_v4i32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @fp_to_sint_i64 (ptr addrspace(1) %out, float %in) { ; SI-LABEL: fp_to_sint_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s1, 0x2f800000 -; SI-NEXT: s_mov_b32 s2, 0xcf800000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s5, 0x2f800000 +; SI-NEXT: s_mov_b32 s6, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_trunc_f32_e32 v0, s0 -; SI-NEXT: v_mul_f32_e64 v1, |v0|, s1 +; SI-NEXT: v_trunc_f32_e32 v0, s4 +; SI-NEXT: v_mul_f32_e64 v1, |v0|, s5 ; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; SI-NEXT: v_floor_f32_e32 v1, v1 ; SI-NEXT: v_cvt_u32_f32_e32 v3, v1 -; SI-NEXT: v_fma_f32 v0, v1, s2, |v0| +; SI-NEXT: v_fma_f32 v0, v1, s6, |v0| ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; 
SI-NEXT: v_xor_b32_e32 v1, v3, v2 ; SI-NEXT: v_xor_b32_e32 v0, v0, v2 ; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; SI-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_sint_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s4, 0x2f800000 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s2, 0x2f800000 ; VI-NEXT: s_mov_b32 s5, 0xcf800000 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_trunc_f32_e32 v0, s2 -; VI-NEXT: v_mul_f32_e64 v1, |v0|, s4 +; VI-NEXT: v_trunc_f32_e32 v0, s4 +; VI-NEXT: v_mul_f32_e64 v1, |v0|, s2 ; VI-NEXT: v_floor_f32_e32 v1, v1 ; VI-NEXT: v_fma_f32 v2, v1, s5, |v0| ; VI-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -294,7 +294,7 @@ entry: define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %x) { ; SI-LABEL: fp_to_sint_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s8, 0x2f800000 @@ -329,7 +329,7 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fp_to_sint_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s8, 0x2f800000 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -452,17 +452,17 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> % define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> %x) { ; SI-LABEL: fp_to_sint_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 
0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s8, 0x2f800000 ; SI-NEXT: s_mov_b32 s9, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_trunc_f32_e32 v0, s1 -; SI-NEXT: v_trunc_f32_e32 v1, s0 -; SI-NEXT: v_trunc_f32_e32 v2, s3 -; SI-NEXT: v_trunc_f32_e32 v3, s2 +; SI-NEXT: v_trunc_f32_e32 v0, s5 +; SI-NEXT: v_trunc_f32_e32 v1, s4 +; SI-NEXT: v_trunc_f32_e32 v2, s7 +; SI-NEXT: v_trunc_f32_e32 v3, s6 ; SI-NEXT: v_mul_f32_e64 v4, |v0|, s8 ; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v0 ; SI-NEXT: v_mul_f32_e64 v6, |v1|, s8 @@ -503,14 +503,14 @@ define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> % ; SI-NEXT: v_subb_u32_e32 v7, vcc, v12, v9, vcc ; SI-NEXT: v_sub_i32_e32 v4, vcc, v13, v11 ; SI-NEXT: v_subb_u32_e32 v5, vcc, v8, v11, vcc -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_sint_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s8, 0x2f800000 ; VI-NEXT: s_mov_b32 s9, 0xcf800000 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -737,8 +737,8 @@ define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> % define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: 
s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -749,8 +749,8 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in ; ; VI-LABEL: fp_to_uint_f32_to_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -787,8 +787,8 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_fabs_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -799,8 +799,8 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa ; ; VI-LABEL: fp_to_uint_fabs_f32_to_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -838,8 +838,8 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa define amdgpu_kernel void @fp_to_sint_f32_i16(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_sint_f32_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -849,12 +849,12 @@ define amdgpu_kernel void @fp_to_sint_f32_i16(ptr 
addrspace(1) %out, float %in) ; ; VI-LABEL: fp_to_sint_f32_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_i32_f32_e32 v0, s2 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cvt_i32_f32_e32 v0, s4 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll index 5abf82aa1aab5..c6b4e129bacbe 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll @@ -8,8 +8,8 @@ declare float @llvm.fabs.f32(float) #1 define amdgpu_kernel void @fp_to_uint_f32_to_i32 (ptr addrspace(1) %out, float %in) { ; SI-LABEL: fp_to_uint_f32_to_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -19,12 +19,12 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i32 (ptr addrspace(1) %out, float % ; ; VI-LABEL: fp_to_uint_f32_to_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_u32_f32_e32 v0, s2 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cvt_u32_f32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -47,7 +47,7 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i32 (ptr addrspace(1) %out, float % define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x 
float> %in) { ; SI-LABEL: fp_to_uint_v2f32_to_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -60,7 +60,7 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: fp_to_uint_v2f32_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -92,7 +92,7 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: fp_to_uint_v4f32_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -107,7 +107,7 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(ptr addrspace(1) %out, ptr ; ; VI-LABEL: fp_to_uint_v4f32_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -152,34 +152,34 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @fp_to_uint_f32_to_i64(ptr addrspace(1) %out, float %x) { ; SI-LABEL: fp_to_uint_f32_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s1, 0xcf800000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; 
SI-NEXT: s_mov_b32 s5, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_trunc_f32_e32 v0, s0 +; SI-NEXT: v_trunc_f32_e32 v0, s4 ; SI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; SI-NEXT: v_floor_f32_e32 v2, v1 ; SI-NEXT: v_cvt_u32_f32_e32 v1, v2 -; SI-NEXT: v_fma_f32 v0, v2, s1, v0 +; SI-NEXT: v_fma_f32 v0, v2, s5, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_uint_f32_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xcf800000 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s2, 0xcf800000 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_trunc_f32_e32 v0, s2 +; VI-NEXT: v_trunc_f32_e32 v0, s4 ; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; VI-NEXT: v_floor_f32_e32 v2, v1 -; VI-NEXT: v_fma_f32 v0, v2, s3, v0 +; VI-NEXT: v_fma_f32 v0, v2, s2, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v1, v2 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -240,7 +240,7 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i64(ptr addrspace(1) %out, float %x define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x float> %x) { ; SI-LABEL: fp_to_uint_v2f32_to_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s8, 0xcf800000 @@ -264,7 +264,7 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: fp_to_uint_v2f32_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: 
s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -376,16 +376,16 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x float> %x) { ; SI-LABEL: fp_to_uint_v4f32_to_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s8, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_trunc_f32_e32 v0, s1 -; SI-NEXT: v_trunc_f32_e32 v2, s0 -; SI-NEXT: v_trunc_f32_e32 v4, s3 -; SI-NEXT: v_trunc_f32_e32 v6, s2 +; SI-NEXT: v_trunc_f32_e32 v0, s5 +; SI-NEXT: v_trunc_f32_e32 v2, s4 +; SI-NEXT: v_trunc_f32_e32 v4, s7 +; SI-NEXT: v_trunc_f32_e32 v6, s6 ; SI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; SI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; SI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 @@ -406,14 +406,14 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x ; SI-NEXT: v_cvt_u32_f32_e32 v0, v8 ; SI-NEXT: v_cvt_u32_f32_e32 v6, v4 ; SI-NEXT: v_cvt_u32_f32_e32 v4, v9 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_uint_v4f32_to_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s2, 0xcf800000 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -619,8 +619,8 @@ define amdgpu_kernel void 
@fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -631,8 +631,8 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in ; ; VI-LABEL: fp_to_uint_f32_to_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -669,8 +669,8 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_fabs_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -681,8 +681,8 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa ; ; VI-LABEL: fp_to_uint_fabs_f32_to_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -720,8 +720,8 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa define amdgpu_kernel void @fp_to_uint_f32_to_i16(ptr addrspace(1) %out, 
float %in) #0 { ; SI-LABEL: fp_to_uint_f32_to_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -731,12 +731,12 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i16(ptr addrspace(1) %out, float %i ; ; VI-LABEL: fp_to_uint_f32_to_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_u32_f32_e32 v0, s2 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cvt_u32_f32_e32 v0, s4 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll index 82c25c01b1779..8c6dc4395839c 100644 --- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @fpext_f16_to_f32( ; SI-LABEL: fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @fpext_f16_to_f32( ; ; GFX89-LABEL: fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -43,7 +43,7 @@ define amdgpu_kernel void @fpext_f16_to_f32( ; ; GFX11-LABEL: fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 
0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -72,7 +72,7 @@ entry: define amdgpu_kernel void @fpext_f16_to_f64( ; SI-LABEL: fpext_f16_to_f64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -91,7 +91,7 @@ define amdgpu_kernel void @fpext_f16_to_f64( ; ; GFX89-LABEL: fpext_f16_to_f64: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -110,7 +110,7 @@ define amdgpu_kernel void @fpext_f16_to_f64( ; ; GFX11-LABEL: fpext_f16_to_f64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -141,7 +141,7 @@ entry: define amdgpu_kernel void @fpext_v2f16_to_v2f32( ; SI-LABEL: fpext_v2f16_to_v2f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -161,7 +161,7 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f32( ; ; GFX89-LABEL: fpext_v2f16_to_v2f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -180,7 +180,7 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f32( ; ; GFX11-LABEL: fpext_v2f16_to_v2f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: 
s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -212,7 +212,7 @@ entry: define amdgpu_kernel void @fpext_v2f16_to_v2f64( ; SI-LABEL: fpext_v2f16_to_v2f64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -234,7 +234,7 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f64( ; ; GFX89-LABEL: fpext_v2f16_to_v2f64: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -255,7 +255,7 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f64( ; ; GFX11-LABEL: fpext_v2f16_to_v2f64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -290,46 +290,35 @@ entry: define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(ptr addrspace(1) %r, i32 %a) { ; SI-LABEL: s_fneg_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: s_fneg_fpext_f16_to_f32: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; 
VI-NEXT: s_endpgm -; -; GFX9-LABEL: s_fneg_fpext_f16_to_f32: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v0, s2 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GFX9-NEXT: s_endpgm +; GFX89-LABEL: s_fneg_fpext_f16_to_f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e32 v0, s4 +; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX89-NEXT: s_endpgm ; ; GFX11-LABEL: s_fneg_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -345,7 +334,7 @@ entry: define amdgpu_kernel void @fneg_fpext_f16_to_f32( ; SI-LABEL: fneg_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -363,7 +352,7 @@ define amdgpu_kernel void @fneg_fpext_f16_to_f32( ; ; GFX89-LABEL: fneg_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; 
GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -381,7 +370,7 @@ define amdgpu_kernel void @fneg_fpext_f16_to_f32( ; ; GFX11-LABEL: fneg_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -411,7 +400,7 @@ entry: define amdgpu_kernel void @fabs_fpext_f16_to_f32( ; SI-LABEL: fabs_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -429,7 +418,7 @@ define amdgpu_kernel void @fabs_fpext_f16_to_f32( ; ; GFX89-LABEL: fabs_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -447,7 +436,7 @@ define amdgpu_kernel void @fabs_fpext_f16_to_f32( ; ; GFX11-LABEL: fabs_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -477,7 +466,7 @@ entry: define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32( ; SI-LABEL: fneg_fabs_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -495,7 +484,7 @@ define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32( ; ; GFX89-LABEL: fneg_fabs_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 
; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -513,7 +502,7 @@ define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32( ; ; GFX11-LABEL: fneg_fabs_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -546,7 +535,7 @@ entry: define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32( ; SI-LABEL: fneg_multi_use_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -568,7 +557,7 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32( ; ; GFX89-LABEL: fneg_multi_use_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -590,7 +579,7 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32( ; ; GFX11-LABEL: fneg_multi_use_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -625,7 +614,7 @@ entry: define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32( ; SI-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -649,7 +638,7 @@ define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32( ; ; GFX89-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; 
GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -671,7 +660,7 @@ define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32( ; ; GFX11-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -707,7 +696,7 @@ entry: define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32( ; SI-LABEL: fabs_multi_use_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -729,7 +718,7 @@ define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32( ; ; GFX89-LABEL: fabs_multi_use_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -751,7 +740,7 @@ define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32( ; ; GFX11-LABEL: fabs_multi_use_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -786,7 +775,7 @@ entry: define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32( ; SI-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -810,7 +799,7 @@ 
define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32( ; ; GFX89-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -832,7 +821,7 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32( ; ; GFX11-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -868,7 +857,7 @@ entry: define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32( ; SI-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -890,7 +879,7 @@ define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32( ; ; GFX89-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -912,7 +901,7 @@ define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32( ; ; GFX11-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -948,7 +937,7 @@ entry: define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( ; SI-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; 
SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -972,7 +961,7 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( ; ; GFX89-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -994,7 +983,7 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( ; ; GFX11-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -1031,3 +1020,6 @@ entry: declare half @llvm.fabs.f16(half) #1 attributes #1 = { nounwind readnone } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX9: {{.*}} +; VI: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll index 238010ec05e4d..0e12cca1900ce 100644 --- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @fptosi_f16_to_i16( ; SI-LABEL: fptosi_f16_to_i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @fptosi_f16_to_i16( ; ; VI-LABEL: fptosi_f16_to_i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -43,7 +43,7 @@ define amdgpu_kernel void @fptosi_f16_to_i16( ; ; GFX11-LABEL: fptosi_f16_to_i16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -72,7 +72,7 @@ entry: define amdgpu_kernel void @fptosi_f16_to_i32( ; SI-LABEL: fptosi_f16_to_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -91,7 +91,7 @@ define amdgpu_kernel void @fptosi_f16_to_i32( ; ; VI-LABEL: fptosi_f16_to_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -110,7 +110,7 @@ define amdgpu_kernel void @fptosi_f16_to_i32( ; ; GFX11-LABEL: fptosi_f16_to_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], 
s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -144,7 +144,7 @@ entry: define amdgpu_kernel void @fptosi_f16_to_i64( ; SI-LABEL: fptosi_f16_to_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -164,7 +164,7 @@ define amdgpu_kernel void @fptosi_f16_to_i64( ; ; VI-LABEL: fptosi_f16_to_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -184,7 +184,7 @@ define amdgpu_kernel void @fptosi_f16_to_i64( ; ; GFX11-LABEL: fptosi_f16_to_i64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -216,7 +216,7 @@ entry: define amdgpu_kernel void @fptosi_v2f16_to_v2i16( ; SI-LABEL: fptosi_v2f16_to_v2i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -241,7 +241,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16( ; ; VI-LABEL: fptosi_v2f16_to_v2i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -261,7 +261,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16( ; ; GFX11-LABEL: fptosi_v2f16_to_v2i16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: 
s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -296,7 +296,7 @@ entry: define amdgpu_kernel void @fptosi_v2f16_to_v2i32( ; SI-LABEL: fptosi_v2f16_to_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -318,7 +318,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i32( ; ; VI-LABEL: fptosi_v2f16_to_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -339,7 +339,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i32( ; ; GFX11-LABEL: fptosi_v2f16_to_v2i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -377,7 +377,7 @@ entry: define amdgpu_kernel void @fptosi_v2f16_to_v2i64( ; SI-LABEL: fptosi_v2f16_to_v2i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -401,7 +401,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64( ; ; VI-LABEL: fptosi_v2f16_to_v2i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -424,7 +424,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64( ; ; GFX11-LABEL: fptosi_v2f16_to_v2i64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; 
GFX11-NEXT: s_mov_b32 s10, s6 @@ -462,8 +462,8 @@ entry: define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) { ; SI-LABEL: fptosi_f16_to_i1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -475,8 +475,8 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) { ; ; VI-LABEL: fptosi_f16_to_i1: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -488,11 +488,11 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) { ; GFX11-LABEL: fptosi_f16_to_i1: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f16_e64 s2, -1.0, s2 +; GFX11-NEXT: v_cmp_eq_f16_e64 s2, -1.0, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll index 1116dc9ae2e5b..abc5c7af13b0c 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @fptoui_f16_to_i16( ; SI-LABEL: fptoui_f16_to_i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 
0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @fptoui_f16_to_i16( ; ; VI-LABEL: fptoui_f16_to_i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -43,7 +43,7 @@ define amdgpu_kernel void @fptoui_f16_to_i16( ; ; GFX11-LABEL: fptoui_f16_to_i16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -72,7 +72,7 @@ entry: define amdgpu_kernel void @fptoui_f16_to_i32( ; SI-LABEL: fptoui_f16_to_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -91,7 +91,7 @@ define amdgpu_kernel void @fptoui_f16_to_i32( ; ; VI-LABEL: fptoui_f16_to_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -110,7 +110,7 @@ define amdgpu_kernel void @fptoui_f16_to_i32( ; ; GFX11-LABEL: fptoui_f16_to_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -144,7 +144,7 @@ entry: define amdgpu_kernel void @fptoui_f16_to_i64( ; SI-LABEL: fptoui_f16_to_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -164,7 +164,7 @@ define amdgpu_kernel void 
@fptoui_f16_to_i64( ; ; VI-LABEL: fptoui_f16_to_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -184,7 +184,7 @@ define amdgpu_kernel void @fptoui_f16_to_i64( ; ; GFX11-LABEL: fptoui_f16_to_i64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -216,7 +216,7 @@ entry: define amdgpu_kernel void @fptoui_v2f16_to_v2i16( ; SI-LABEL: fptoui_v2f16_to_v2i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -240,7 +240,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i16( ; ; VI-LABEL: fptoui_v2f16_to_v2i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -260,7 +260,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i16( ; ; GFX11-LABEL: fptoui_v2f16_to_v2i16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -295,7 +295,7 @@ entry: define amdgpu_kernel void @fptoui_v2f16_to_v2i32( ; SI-LABEL: fptoui_v2f16_to_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -317,7 +317,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i32( ; ; VI-LABEL: fptoui_v2f16_to_v2i32: ; VI: ; %bb.0: 
; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -338,7 +338,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i32( ; ; GFX11-LABEL: fptoui_v2f16_to_v2i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -376,7 +376,7 @@ entry: define amdgpu_kernel void @fptoui_v2f16_to_v2i64( ; SI-LABEL: fptoui_v2f16_to_v2i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -400,7 +400,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64( ; ; VI-LABEL: fptoui_v2f16_to_v2i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -423,7 +423,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64( ; ; GFX11-LABEL: fptoui_v2f16_to_v2i64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -460,21 +460,22 @@ entry: define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) { ; SI-LABEL: fptoui_f16_to_i1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 
s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_cmp_eq_f32_e32 vcc, 1.0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fptoui_f16_to_i1: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -486,11 +487,11 @@ define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) { ; GFX11-LABEL: fptoui_f16_to_i1: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f16_e64 s2, 1.0, s2 +; GFX11-NEXT: v_cmp_eq_f16_e64 s2, 1.0, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll index 6cc7368eeae61..65ac2e240469d 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -11,7 +11,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; SI-SDAG-LABEL: fptrunc_f32_to_f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -29,7 +29,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; ; SI-GISEL-LABEL: fptrunc_f32_to_f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -41,7 +41,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; ; VI-SDAG-LABEL: fptrunc_f32_to_f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -59,7 +59,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; ; VI-GISEL-LABEL: fptrunc_f32_to_f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -71,7 +71,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; ; GFX9-SDAG-LABEL: fptrunc_f32_to_f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -89,7 +89,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; ; GFX9-GISEL-LABEL: fptrunc_f32_to_f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -101,7 +101,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; ; GFX11-SDAG-LABEL: fptrunc_f32_to_f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -121,7 +121,7 @@ define amdgpu_kernel void 
@fptrunc_f32_to_f16( ; ; GFX11-GISEL-LABEL: fptrunc_f32_to_f16: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 @@ -144,7 +144,7 @@ entry: define amdgpu_kernel void @fptrunc_f64_to_f16( ; SI-SDAG-LABEL: fptrunc_f64_to_f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -163,7 +163,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; SI-GISEL-LABEL: fptrunc_f64_to_f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -176,7 +176,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; VI-SDAG-LABEL: fptrunc_f64_to_f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -195,7 +195,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; VI-GISEL-LABEL: fptrunc_f64_to_f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -208,7 +208,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; GFX9-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -227,7 +227,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; GFX9-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -240,7 +240,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; GFX11-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -262,7 +262,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; GFX11-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -287,7 +287,7 @@ entry: define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; SI-SDAG-LABEL: fptrunc_v2f32_to_v2f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -308,7 +308,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; SI-GISEL-LABEL: fptrunc_v2f32_to_v2f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ 
-323,7 +323,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; VI-SDAG-LABEL: fptrunc_v2f32_to_v2f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -343,7 +343,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; VI-GISEL-LABEL: fptrunc_v2f32_to_v2f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -358,7 +358,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; GFX9-SDAG-LABEL: fptrunc_v2f32_to_v2f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -378,7 +378,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; GFX9-GISEL-LABEL: fptrunc_v2f32_to_v2f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -392,7 +392,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; GFX11-SDAG-LABEL: fptrunc_v2f32_to_v2f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -415,7 +415,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; GFX11-GISEL-LABEL: fptrunc_v2f32_to_v2f16: ; GFX11-GISEL: ; 
%bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -441,7 +441,7 @@ entry: define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; SI-SDAG-LABEL: fptrunc_v2f64_to_v2f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -464,7 +464,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; SI-GISEL-LABEL: fptrunc_v2f64_to_v2f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -481,7 +481,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -503,7 +503,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -519,7 +519,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; GFX9-SDAG-LABEL: fptrunc_v2f64_to_v2f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-SDAG-NEXT: 
s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -541,7 +541,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; GFX9-GISEL-LABEL: fptrunc_v2f64_to_v2f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 @@ -557,7 +557,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; GFX11-SDAG-LABEL: fptrunc_v2f64_to_v2f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -584,7 +584,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; GFX11-GISEL-LABEL: fptrunc_v2f64_to_v2f16: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 @@ -613,7 +613,7 @@ entry: define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; SI-SDAG-LABEL: fneg_fptrunc_f32_to_f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -631,7 +631,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; SI-GISEL-LABEL: fneg_fptrunc_f32_to_f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -643,7 +643,7 @@ define 
amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; VI-SDAG-LABEL: fneg_fptrunc_f32_to_f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -661,7 +661,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; VI-GISEL-LABEL: fneg_fptrunc_f32_to_f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -673,7 +673,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; GFX9-SDAG-LABEL: fneg_fptrunc_f32_to_f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -691,7 +691,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; GFX9-GISEL-LABEL: fneg_fptrunc_f32_to_f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -703,7 +703,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; GFX11-SDAG-LABEL: fneg_fptrunc_f32_to_f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -723,7 +723,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; GFX11-GISEL-LABEL: fneg_fptrunc_f32_to_f16: ; GFX11-GISEL: ; %bb.0: ; %entry -; 
GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 @@ -747,7 +747,7 @@ entry: define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; SI-SDAG-LABEL: fabs_fptrunc_f32_to_f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -765,7 +765,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; SI-GISEL-LABEL: fabs_fptrunc_f32_to_f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -777,7 +777,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; VI-SDAG-LABEL: fabs_fptrunc_f32_to_f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -795,7 +795,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; VI-GISEL-LABEL: fabs_fptrunc_f32_to_f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -807,7 +807,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; GFX9-SDAG-LABEL: fabs_fptrunc_f32_to_f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; 
GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -825,7 +825,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; GFX9-GISEL-LABEL: fabs_fptrunc_f32_to_f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -837,7 +837,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; GFX11-SDAG-LABEL: fabs_fptrunc_f32_to_f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -857,7 +857,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; GFX11-GISEL-LABEL: fabs_fptrunc_f32_to_f16: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 @@ -881,7 +881,7 @@ entry: define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; SI-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -899,7 +899,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; SI-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -911,7 +911,7 @@ 
define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; VI-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -929,7 +929,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; VI-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -941,7 +941,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; GFX9-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -959,7 +959,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; GFX9-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -971,7 +971,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; GFX11-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -991,7 +991,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; GFX11-GISEL-LABEL: 
fneg_fabs_fptrunc_f32_to_f16: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 @@ -1016,7 +1016,7 @@ entry: define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; SI-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1034,7 +1034,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; SI-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -1046,7 +1046,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; VI-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1064,7 +1064,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; VI-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -1076,7 +1076,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; GFX9-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1094,7 +1094,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; GFX9-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -1106,7 +1106,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; GFX11-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1128,7 +1128,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; GFX11-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 @@ -1155,7 +1155,7 @@ entry: define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; SI-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1173,7 +1173,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; SI-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -1185,7 +1185,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; VI-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1203,7 +1203,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; VI-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -1215,7 +1215,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; GFX9-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1233,7 +1233,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; GFX9-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -1245,7 +1245,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; GFX11-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 
0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1267,7 +1267,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; GFX11-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 @@ -1295,7 +1295,7 @@ entry: define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; SI-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1314,7 +1314,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; SI-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -1327,7 +1327,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; VI-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1346,7 +1346,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; VI-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 
; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -1359,7 +1359,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; GFX9-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1378,7 +1378,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; GFX9-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -1391,7 +1391,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; GFX11-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1413,7 +1413,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; GFX11-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll index e4aa4d1d3ddb5..1ba5e8f916cba 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll @@ -16,7 +16,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) { ; SI-LABEL: fptrunc_f64_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -28,7 +28,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) ; ; VI-SDAG-LABEL: fptrunc_f64_to_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -40,7 +40,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) ; ; VI-GISEL-LABEL: fptrunc_f64_to_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; VI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -50,7 +50,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) ; ; GFX10-SDAG-LABEL: fptrunc_f64_to_f32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000 @@ -60,7 +60,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) ; ; GFX10-GISEL-LABEL: fptrunc_f64_to_f32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX10-GISEL-NEXT: s_mov_b32 s2, -1 @@ -70,7 +70,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) ; ; GFX11-SDAG-LABEL: fptrunc_f64_to_f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 
0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 @@ -82,7 +82,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) ; ; GFX11-GISEL-LABEL: fptrunc_f64_to_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 @@ -99,7 +99,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) { ; SI-LABEL: fptrunc_f64_to_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -159,7 +159,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; VI-SAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; VI-SAFE-SDAG: ; %bb.0: -; VI-SAFE-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-SAFE-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; VI-SAFE-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; VI-SAFE-SDAG-NEXT: s_mov_b32 s2, -1 ; VI-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -218,7 +218,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; VI-SAFE-GISEL-LABEL: fptrunc_f64_to_f16: ; VI-SAFE-GISEL: ; %bb.0: -; VI-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; VI-SAFE-GISEL-NEXT: s_lshr_b32 s5, s3, 8 @@ -270,7 +270,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; VI-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; VI-UNSAFE-SDAG: ; %bb.0: -; VI-UNSAFE-SDAG-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 +; VI-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0xf000 @@ -281,7 +281,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; VI-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16: ; VI-UNSAFE-GISEL: ; %bb.0: -; VI-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; VI-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1 @@ -292,7 +292,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX10-SAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX10-SAFE-SDAG: ; %bb.0: -; GFX10-SAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SAFE-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff ; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s5, s3, 8 @@ -348,7 +348,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX10-SAFE-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX10-SAFE-GISEL: ; %bb.0: -; GFX10-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX10-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 @@ -400,7 +400,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX10-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX10-UNSAFE-SDAG: ; %bb.0: -; GFX10-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 
0x31016000 @@ -411,7 +411,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX10-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX10-UNSAFE-GISEL: ; %bb.0: -; GFX10-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX10-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1 @@ -422,7 +422,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX11-SAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX11-SAFE-SDAG: ; %bb.0: -; GFX11-SAFE-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SAFE-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SAFE-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff ; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s5, s3, 8 @@ -489,7 +489,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX11-SAFE-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX11-SAFE-GISEL: ; %bb.0: -; GFX11-SAFE-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SAFE-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 @@ -548,7 +548,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX11-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX11-UNSAFE-SDAG: ; %bb.0: -; GFX11-UNSAFE-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-UNSAFE-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX11-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000 @@ -562,7 +562,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX11-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX11-UNSAFE-GISEL: ; %bb.0: -; 
GFX11-UNSAFE-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-UNSAFE-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX11-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1 @@ -582,8 +582,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x double> %in) { ; SI-LABEL: fptrunc_v2f64_to_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -594,8 +594,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do ; ; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s2, -1 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -606,8 +606,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do ; ; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: s_mov_b32 s2, -1 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -619,8 +619,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do ; GFX10-SDAG-LABEL: fptrunc_v2f64_to_v2f32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -632,8 +632,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do ; GFX10-GISEL-LABEL: fptrunc_v2f64_to_v2f32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -645,8 +645,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do ; GFX11-SDAG-LABEL: fptrunc_v2f64_to_v2f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -660,8 +660,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do ; GFX11-GISEL-LABEL: fptrunc_v2f64_to_v2f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -679,37 +679,37 @@ define amdgpu_kernel void 
@fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x double> %in) { ; SI-LABEL: fptrunc_v3f64_to_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x15 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x11 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x15 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] -; SI-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] -; SI-NEXT: v_cvt_f32_f64_e32 v2, s[0:1] -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] +; SI-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; SI-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] +; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-SDAG-LABEL: fptrunc_v3f64_to_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54 -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x54 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x44 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; VI-SDAG-NEXT: s_mov_b32 s2, -1 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[2:3] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] ; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] ; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; VI-SDAG-NEXT: s_mov_b32 s2, -1 ; VI-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: 
fptrunc_v3f64_to_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: s_mov_b32 s2, -1 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -721,24 +721,25 @@ define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x do ; ; GFX10-SDAG-LABEL: fptrunc_v3f64_to_v3f32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x2 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x44 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[2:3] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[0:1] ; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] ; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: fptrunc_v3f64_to_v3f32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -750,16 +751,17 @@ define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x do ; ; GFX11-SDAG-LABEL: fptrunc_v3f64_to_v3f32: 
; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x2 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x54 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x44 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x54 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x44 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[2:3] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[0:1] ; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] ; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], 0 ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -768,8 +770,8 @@ define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x do ; GFX11-GISEL-LABEL: fptrunc_v3f64_to_v3f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x44 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[2:3], 0x44 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -788,8 +790,8 @@ define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x do define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x double> %in) { ; SI-LABEL: fptrunc_v4f64_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -802,8 +804,8 @@ define 
amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do ; ; VI-SDAG-LABEL: fptrunc_v4f64_to_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s2, -1 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -816,8 +818,8 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do ; ; VI-GISEL-LABEL: fptrunc_v4f64_to_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: s_mov_b32 s2, -1 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -831,8 +833,8 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do ; GFX10-SDAG-LABEL: fptrunc_v4f64_to_v4f32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -846,8 +848,8 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do ; GFX10-GISEL-LABEL: fptrunc_v4f64_to_v4f32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX10-GISEL-NEXT: s_mov_b32 
s3, 0x31016000 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -861,8 +863,8 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do ; GFX11-SDAG-LABEL: fptrunc_v4f64_to_v4f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x44 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[2:3], 0x44 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -878,8 +880,8 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do ; GFX11-GISEL-LABEL: fptrunc_v4f64_to_v4f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x44 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[2:3], 0x44 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -899,8 +901,8 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x double> %in) { ; SI-LABEL: fptrunc_v8f64_to_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -918,8 +920,8 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do ; ; VI-SDAG-LABEL: fptrunc_v8f64_to_v8f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; 
VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s2, -1 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -937,8 +939,8 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do ; ; VI-GISEL-LABEL: fptrunc_v8f64_to_v8f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: s_mov_b32 s2, -1 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -957,8 +959,8 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do ; GFX10-SDAG-LABEL: fptrunc_v8f64_to_v8f32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -977,8 +979,8 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do ; GFX10-GISEL-LABEL: fptrunc_v8f64_to_v8f32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -997,8 +999,8 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do ; GFX11-SDAG-LABEL: fptrunc_v8f64_to_v8f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b512 s[4:19], 
s[0:1], 0x64 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b512 s[4:19], s[2:3], 0x64 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1020,8 +1022,8 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do ; GFX11-GISEL-LABEL: fptrunc_v8f64_to_v8f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b512 s[4:19], s[0:1], 0x64 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b512 s[4:19], s[2:3], 0x64 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 0d59021b69019..7c5d73ab66b47 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -10,8 +10,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -51,8 +51,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; CI-LABEL: frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s2, s10 @@ -92,8 +92,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; VI-LABEL: frem_f16: ; 
VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 8 @@ -120,12 +120,12 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; GFX9-LABEL: frem_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 +; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX9-NEXT: v_rcp_f32_e32 v3, v3 @@ -139,13 +139,13 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX10-LABEL: frem_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 +; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 @@ -159,8 +159,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX11-LABEL: frem_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; 
GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -184,8 +184,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX1150-LABEL: frem_f16: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -218,8 +218,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: fast_frem_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -247,8 +247,8 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: fast_frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s2, s10 @@ -276,8 +276,8 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: fast_frem_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, 
s6 ; VI-NEXT: s_add_u32 s0, s0, 8 @@ -299,12 +299,12 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: fast_frem_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 +; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f16_e32 v3, v2 ; GFX9-NEXT: v_mul_f16_e32 v3, v1, v3 @@ -316,13 +316,13 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX10-LABEL: fast_frem_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 +; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f16_e32 v3, v2 ; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3 @@ -334,8 +334,8 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-LABEL: fast_frem_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -356,8 +356,8 @@ define amdgpu_kernel void @fast_frem_f16(ptr 
addrspace(1) %out, ptr addrspace(1) ; GFX1150-LABEL: fast_frem_f16: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -387,8 +387,8 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: unsafe_frem_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -416,8 +416,8 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: unsafe_frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s2, s10 @@ -445,8 +445,8 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: unsafe_frem_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 8 @@ -468,12 +468,12 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: unsafe_frem_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; 
GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 +; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f16_e32 v3, v2 ; GFX9-NEXT: v_mul_f16_e32 v3, v1, v3 @@ -485,13 +485,13 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX10-LABEL: unsafe_frem_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 +; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f16_e32 v3, v2 ; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3 @@ -503,8 +503,8 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX11-LABEL: unsafe_frem_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -525,8 +525,8 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX1150-LABEL: unsafe_frem_f16: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: 
s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -556,8 +556,8 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -592,8 +592,8 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; CI-LABEL: frem_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s2, s10 @@ -628,8 +628,8 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; VI-LABEL: frem_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -662,12 +662,12 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; GFX9-LABEL: frem_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; 
GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16 +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_div_scale_f32 v4, s[0:1], v2, v2, v1 ; GFX9-NEXT: v_div_scale_f32 v3, vcc, v1, v2, v1 @@ -690,13 +690,13 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX10-LABEL: frem_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16 +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v4, s0, v2, v2, v1 ; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1 @@ -719,8 +719,8 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX11-LABEL: frem_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -756,8 +756,8 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX1150-LABEL: frem_f32: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -802,8 +802,8 @@ define amdgpu_kernel void 
@frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: fast_frem_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -827,8 +827,8 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: fast_frem_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s2, s10 @@ -852,8 +852,8 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: fast_frem_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -875,12 +875,12 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: fast_frem_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16 +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f32_e32 v3, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, 
v1, v3 @@ -892,13 +892,13 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX10-LABEL: fast_frem_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16 +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3 @@ -910,8 +910,8 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-LABEL: fast_frem_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -932,8 +932,8 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1150-LABEL: fast_frem_f32: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -963,8 +963,8 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: unsafe_frem_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -988,8 +988,8 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: unsafe_frem_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s2, s10 @@ -1013,8 +1013,8 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: unsafe_frem_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -1036,12 +1036,12 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: unsafe_frem_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16 +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f32_e32 v3, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 @@ -1053,13 +1053,13 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; GFX10-LABEL: unsafe_frem_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16 +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3 @@ -1071,8 +1071,8 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; GFX11-LABEL: unsafe_frem_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1093,8 +1093,8 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; GFX1150-LABEL: unsafe_frem_f32: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -1124,8 +1124,8 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1182,8 +1182,8 
@@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; CI-LABEL: frem_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s2, s10 @@ -1217,8 +1217,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; VI-LABEL: frem_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -1248,12 +1248,12 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; GFX9-LABEL: frem_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v12, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v12, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] ; GFX9-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] @@ -1275,13 +1275,13 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX10-LABEL: frem_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v12, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) 
; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v12, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v12, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[4:5], s0, v[2:3], v[2:3], v[0:1] ; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] @@ -1302,8 +1302,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX11-LABEL: frem_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v12, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1337,8 +1337,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX1150-LABEL: frem_f64: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v12, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -1379,8 +1379,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: fast_frem_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1430,8 +1430,8 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: fast_frem_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s2, s10 @@ -1461,8 +1461,8 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: fast_frem_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -1488,12 +1488,12 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: fast_frem_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1511,13 +1511,13 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; GFX10-LABEL: fast_frem_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], 
v10, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1535,8 +1535,8 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-LABEL: fast_frem_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v10, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1566,8 +1566,8 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1150-LABEL: fast_frem_f64: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v10, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -1604,8 +1604,8 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: unsafe_frem_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1655,8 +1655,8 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: unsafe_frem_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 
s10, -1 ; CI-NEXT: s_mov_b32 s2, s10 @@ -1686,8 +1686,8 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: unsafe_frem_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -1713,12 +1713,12 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: unsafe_frem_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1736,13 +1736,13 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; GFX10-LABEL: unsafe_frem_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1760,8 +1760,8 @@ define 
amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; GFX11-LABEL: unsafe_frem_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v10, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1791,8 +1791,8 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; GFX1150-LABEL: unsafe_frem_f64: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v10, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -1829,8 +1829,8 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1892,8 +1892,8 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; CI-LABEL: frem_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s10, s2 @@ -1955,8 +1955,8 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v2f16: ; VI: ; %bb.0: -; 
VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -1995,12 +1995,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: frem_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16 +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX9-NEXT: v_rcp_f32_e32 v3, v3 @@ -2023,13 +2023,13 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-LABEL: frem_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16 +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 @@ -2052,8 +2052,8 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-LABEL: frem_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: 
s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2090,8 +2090,8 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-LABEL: frem_v2f16: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -2139,8 +2139,8 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_v4f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2242,8 +2242,8 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; CI-LABEL: frem_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s10, s2 @@ -2345,8 +2345,8 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: 
s_add_u32 s0, s0, 32 @@ -2405,12 +2405,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: frem_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX9-NEXT: v_rcp_f32_e32 v5, v5 @@ -2448,13 +2448,13 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-LABEL: frem_v4f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:32 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX10-NEXT: v_rcp_f32_e32 v5, v5 @@ -2492,8 +2492,8 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-LABEL: frem_v4f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2552,8 +2552,8 @@ define amdgpu_kernel void 
@frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-LABEL: frem_v4f16: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v4, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -2625,8 +2625,8 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2676,8 +2676,8 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; CI-LABEL: frem_v2f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s10, s2 @@ -2727,8 +2727,8 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 32 @@ -2776,12 +2776,12 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: frem_v2f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; 
GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v3, v3, v1 ; GFX9-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 @@ -2819,13 +2819,13 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-LABEL: frem_v2f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:32 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v6, s0, v3, v3, v1 ; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 @@ -2863,8 +2863,8 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-LABEL: frem_v2f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2921,8 +2921,8 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-LABEL: frem_v2f32: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], 
s[0:1], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v4, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -2989,8 +2989,8 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3070,8 +3070,8 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; CI-LABEL: frem_v4f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s10, s2 @@ -3151,8 +3151,8 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_add_u32 s0, s0, 64 @@ -3230,12 +3230,12 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: frem_v4f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 
v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64 +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:64 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_div_scale_f32 v10, s[0:1], v7, v7, v3 ; GFX9-NEXT: v_div_scale_f32 v9, vcc, v3, v7, v3 @@ -3303,13 +3303,13 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-LABEL: frem_v4f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64 +; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:64 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v10, s0, v7, v7, v3 ; GFX10-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3 @@ -3377,8 +3377,8 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-LABEL: frem_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -3477,8 +3477,8 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-LABEL: frem_v4f32: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; 
GFX1150-NEXT: v_mov_b32_e32 v8, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -3589,8 +3589,8 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3681,8 +3681,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; CI-LABEL: frem_v2f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s10, s2 @@ -3730,8 +3730,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_add_u32 s0, s0, 64 @@ -3777,12 +3777,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: frem_v2f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:64 +; 
GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3] ; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] @@ -3818,13 +3818,13 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-LABEL: frem_v2f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v16, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:64 +; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[8:9], s0, v[6:7], v[6:7], v[2:3] ; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] @@ -3858,8 +3858,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-LABEL: frem_v2f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v16, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -3912,8 +3912,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-LABEL: frem_v2f64: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v16, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll 
b/llvm/test/CodeGen/AMDGPU/fshl.ll index 4ea3323a9dbfc..ea588df86b846 100644 --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -13,51 +13,49 @@ declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind rea define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z) { ; SI-LABEL: fshl_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: s_not_b32 s5, s8 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: v_alignbit_b32 v0, s6, v0, 1 -; SI-NEXT: s_lshr_b32 s4, s6, 1 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: s_lshr_b32 s5, s4, 1 +; SI-NEXT: v_alignbit_b32 v0, s4, v0, 1 +; SI-NEXT: s_not_b32 s4, s6 +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_alignbit_b32 v0, s5, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: s_not_b32 s0, s0 -; VI-NEXT: s_lshr_b32 s1, s6, 1 -; VI-NEXT: v_alignbit_b32 v0, s6, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_alignbit_b32 v2, s1, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: s_not_b32 s3, s6 +; VI-NEXT: s_lshr_b32 s2, s4, 1 +; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_alignbit_b32 v2, s2, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; 
VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_not_b32 s1, s2 -; GFX9-NEXT: s_lshr_b32 s0, s6, 1 -; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_not_b32 s3, s6 +; GFX9-NEXT: s_lshr_b32 s2, s4, 1 +; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 1 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_i32: @@ -77,30 +75,30 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX10-LABEL: fshl_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s6, s7, 1 -; GFX10-NEXT: s_lshr_b32 s0, s6, 1 -; GFX10-NEXT: s_not_b32 s1, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_alignbit_b32 v0, s4, s5, 1 +; GFX10-NEXT: s_lshr_b32 s2, s4, 1 +; GFX10-NEXT: s_not_b32 s3, s6 +; GFX10-NEXT: v_alignbit_b32 v0, s2, v0, s3 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 
0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s6, s7, 1 -; GFX11-NEXT: s_lshr_b32 s1, s6, 1 -; GFX11-NEXT: s_not_b32 s0, s0 +; GFX11-NEXT: v_alignbit_b32 v0, s4, s5, 1 +; GFX11-NEXT: s_lshr_b32 s2, s4, 1 +; GFX11-NEXT: s_not_b32 s3, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_alignbit_b32 v0, s1, v0, s0 -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_alignbit_b32 v0, s2, v0, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -113,7 +111,7 @@ entry: define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI-LABEL: fshl_i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -126,7 +124,7 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; VI-LABEL: fshl_i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_alignbit_b32 v2, s2, v0, 25 @@ -137,7 +135,7 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX9-LABEL: fshl_i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -159,7 +157,7 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: fshl_i32_imm: ; GFX10: ; %bb.0: ; %entry -; 
GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 25 @@ -168,7 +166,7 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX11-LABEL: fshl_i32_imm: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 25 @@ -185,15 +183,15 @@ entry: define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshl_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_alignbit_b32 v0, s5, v0, 1 ; SI-NEXT: s_not_b32 s1, s1 +; SI-NEXT: v_alignbit_b32 v0, s5, v0, 1 ; SI-NEXT: s_lshr_b32 s2, s5, 1 ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_alignbit_b32 v1, s2, v0, v1 @@ -208,47 +206,47 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; VI-LABEL: fshl_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: s_not_b32 s3, s3 +; VI-NEXT: s_not_b32 s1, s1 ; VI-NEXT: s_lshr_b32 s7, s5, 1 ; VI-NEXT: 
v_alignbit_b32 v0, s5, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_alignbit_b32 v1, s7, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_not_b32 s2, s2 +; VI-NEXT: s_not_b32 s0, s0 ; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; VI-NEXT: s_lshr_b32 s3, s4, 1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_alignbit_b32 v0, s3, v0, v2 -; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_lshr_b32 s1, s4, 1 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_alignbit_b32 v0, s1, v0, v2 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x3c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: s_lshr_b32 s0, s5, 1 -; GFX9-NEXT: s_not_b32 s1, s9 +; GFX9-NEXT: s_lshr_b32 s2, s5, 1 +; GFX9-NEXT: s_not_b32 s3, s9 ; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_alignbit_b32 v1, s2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_not_b32 s1, s8 +; GFX9-NEXT: s_not_b32 s3, s8 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; GFX9-NEXT: s_lshr_b32 s0, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: s_lshr_b32 s2, s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_alignbit_b32 v0, s2, v0, v3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v2i32: @@ -272,39 +270,39 @@ define amdgpu_kernel 
void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-LABEL: fshl_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v0, s5, s7, 1 ; GFX10-NEXT: v_alignbit_b32 v3, s4, s6, 1 -; GFX10-NEXT: s_lshr_b32 s0, s5, 1 -; GFX10-NEXT: s_not_b32 s1, s3 +; GFX10-NEXT: s_lshr_b32 s2, s5, 1 +; GFX10-NEXT: s_not_b32 s1, s1 ; GFX10-NEXT: s_lshr_b32 s3, s4, 1 -; GFX10-NEXT: s_not_b32 s2, s2 -; GFX10-NEXT: v_alignbit_b32 v1, s0, v0, s1 -; GFX10-NEXT: v_alignbit_b32 v0, s3, v3, s2 +; GFX10-NEXT: s_not_b32 s0, s0 +; GFX10-NEXT: v_alignbit_b32 v1, s2, v0, s1 +; GFX10-NEXT: v_alignbit_b32 v0, s3, v3, s0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x3c +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v0, s5, s7, 1 ; GFX11-NEXT: v_alignbit_b32 v3, s4, s6, 1 ; GFX11-NEXT: s_lshr_b32 s5, s5, 1 -; GFX11-NEXT: s_not_b32 s3, s3 +; GFX11-NEXT: s_not_b32 s1, s1 ; GFX11-NEXT: s_lshr_b32 s4, s4, 1 -; GFX11-NEXT: s_not_b32 s2, s2 -; GFX11-NEXT: v_alignbit_b32 v1, s5, v0, s3 -; GFX11-NEXT: v_alignbit_b32 v0, s4, v3, s2 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_not_b32 s0, s0 +; GFX11-NEXT: v_alignbit_b32 v1, s5, v0, s1 
+; GFX11-NEXT: v_alignbit_b32 v0, s4, v3, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -317,8 +315,8 @@ entry: define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) { ; SI-LABEL: fshl_v2i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -331,8 +329,8 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; ; VI-LABEL: fshl_v2i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: v_mov_b32_e32 v2, s6 @@ -345,15 +343,15 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; ; GFX9-LABEL: fshl_v2i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 23 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 25 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v2i32_imm: @@ -373,20 +371,20 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-LABEL: fshl_v2i32_imm: ; GFX10: ; %bb.0: ; %entry ; 
GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, 23 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, 25 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_v2i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, 23 @@ -404,44 +402,44 @@ entry: define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; SI-LABEL: fshl_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x15 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x15 +; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_not_b32 s1, s19 ; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: s_not_b32 s11, s15 ; SI-NEXT: v_alignbit_b32 v0, s7, v0, 1 -; SI-NEXT: s_lshr_b32 s7, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_alignbit_b32 v3, s7, v0, v1 +; SI-NEXT: s_lshr_b32 s0, s7, 1 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_alignbit_b32 v3, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: s_not_b32 s7, s14 +; SI-NEXT: s_not_b32 s1, s18 ; SI-NEXT: 
v_alignbit_b32 v0, s6, v0, 1 -; SI-NEXT: s_lshr_b32 s6, s6, 1 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_alignbit_b32 v2, s6, v0, v1 +; SI-NEXT: s_lshr_b32 s0, s6, 1 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_alignbit_b32 v2, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: s_not_b32 s6, s13 +; SI-NEXT: s_not_b32 s1, s17 ; SI-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; SI-NEXT: s_lshr_b32 s5, s5, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_alignbit_b32 v1, s5, v0, v1 +; SI-NEXT: s_lshr_b32 s0, s5, 1 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_alignbit_b32 v1, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: s_not_b32 s5, s12 +; SI-NEXT: s_not_b32 s1, s16 ; SI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; SI-NEXT: s_lshr_b32 s4, s4, 1 -; SI-NEXT: v_mov_b32_e32 v4, s5 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_lshr_b32 s0, s4, 1 +; SI-NEXT: v_mov_b32_e32 v4, s1 +; SI-NEXT: v_alignbit_b32 v0, s0, v0, v4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v4i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: s_not_b32 s3, s15 @@ -474,36 +472,36 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; GFX9-LABEL: fshl_v4i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: 
v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_not_b32 s1, s15 +; GFX9-NEXT: s_not_b32 s3, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-NEXT: s_lshr_b32 s0, s7, 1 +; GFX9-NEXT: s_lshr_b32 s2, s7, 1 ; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v3, s0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_alignbit_b32 v3, s2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: s_not_b32 s1, s14 +; GFX9-NEXT: s_not_b32 s3, s14 ; GFX9-NEXT: v_alignbit_b32 v0, s6, v0, 1 -; GFX9-NEXT: s_lshr_b32 s0, s6, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, v1 +; GFX9-NEXT: s_lshr_b32 s2, s6, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_alignbit_b32 v2, s2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s9 -; GFX9-NEXT: s_not_b32 s1, s13 +; GFX9-NEXT: s_not_b32 s3, s13 ; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; GFX9-NEXT: s_lshr_b32 s0, s5, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1 +; GFX9-NEXT: s_lshr_b32 s2, s5, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_alignbit_b32 v1, s2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: s_not_b32 s1, s12 +; GFX9-NEXT: s_not_b32 s3, s12 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; GFX9-NEXT: s_lshr_b32 s0, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX9-NEXT: s_lshr_b32 s2, s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NEXT: v_alignbit_b32 v0, s2, v0, v5 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v4i32: @@ -534,11 +532,11 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; GFX10-LABEL: fshl_v4i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 +; GFX10-NEXT: 
s_clause 0x2 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v0, s7, s11, 1 ; GFX10-NEXT: v_alignbit_b32 v1, s6, s10, 1 @@ -562,9 +560,9 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX11-LABEL: fshl_v4i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x54 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[12:15], s[2:3], 0x54 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v0, s7, s11, 1 @@ -596,10 +594,10 @@ entry: define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) { ; SI-LABEL: fshl_v4i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s11 ; SI-NEXT: v_mov_b32_e32 v1, s10 @@ -609,13 +607,13 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; SI-NEXT: v_alignbit_b32 v1, s5, v0, 25 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_alignbit_b32 v0, s4, v0, 31 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v4i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], 
s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: v_mov_b32_e32 v1, s10 @@ -632,9 +630,9 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; ; GFX9-LABEL: fshl_v4i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 ; GFX9-NEXT: v_mov_b32_e32 v1, s10 @@ -668,22 +666,22 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX10-LABEL: fshl_v4i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v3, s7, s11, 31 ; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, 23 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, 25 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, 31 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_v4i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v3, s7, s11, 31 @@ -704,7 +702,7 @@ entry: define amdgpu_kernel void @orxor2or1(ptr 
addrspace(1) %in, i32 %a, i32 %b) { ; SI-LABEL: orxor2or1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -720,7 +718,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; VI-LABEL: orxor2or1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s4, s2, 7 ; VI-NEXT: s_or_b32 s4, s3, s4 @@ -734,7 +732,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; GFX9-LABEL: orxor2or1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s4, s2, 7 @@ -761,7 +759,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; GFX10-LABEL: orxor2or1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshl_b32 s4, s2, 7 @@ -774,7 +772,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; GFX11-LABEL: orxor2or1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s4, s2, 7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index e8310e73f9a47..dbcebe6e07e3f 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -22,42 +22,40 @@ declare <2 x i24> @llvm.fshr.v2i24(<2 x i24>, <2 x i24>, <2 x i24>) define amdgpu_kernel void 
@fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z) { ; SI-LABEL: fshr_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_mov_b32_e32 v1, s8 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: v_alignbit_b32 v0, s6, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_alignbit_b32 v0, s4, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_alignbit_b32 v2, s6, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_alignbit_b32 v2, s4, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, v2 +; GFX9-NEXT: 
global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_i32: @@ -74,24 +72,24 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX10-LABEL: fshr_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s6, s7, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_alignbit_b32 v0, s4, s5, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_alignbit_b32 v0, s2, s3, v0 +; GFX11-NEXT: v_alignbit_b32 v0, s4, s5, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -105,7 +103,7 @@ entry: define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI-LABEL: fshr_i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -118,7 +116,7 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; VI-LABEL: fshr_i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_alignbit_b32 v2, s2, v0, 7 @@ -129,7 +127,7 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX9-LABEL: fshr_i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -151,7 +149,7 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: fshr_i32_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 7 @@ -160,7 +158,7 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX11-LABEL: fshr_i32_imm: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 7 @@ -177,9 +175,9 @@ entry: define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshr_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -194,33 +192,33 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; VI-LABEL: fshr_v2i32: ; VI: ; %bb.0: ; %entry -; 
VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_alignbit_b32 v0, s4, v2, v0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX9-NEXT: s_endpgm @@ -242,13 +240,13 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-LABEL: fshr_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX10-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s3 -; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, v0 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, v2 ; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[8:9] @@ -257,16 +255,16 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX11-LABEL: fshr_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x3c +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 -; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, v0 ; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, v2 -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -279,8 +277,8 @@ entry: define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) { ; SI-LABEL: fshr_v2i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -293,8 +291,8 @@ define amdgpu_kernel void 
@fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; ; VI-LABEL: fshr_v2i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: v_mov_b32_e32 v2, s6 @@ -307,15 +305,15 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; ; GFX9-LABEL: fshr_v2i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 9 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 7 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_v2i32_imm: @@ -335,20 +333,20 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-LABEL: fshr_v2i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, 9 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, 7 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v2i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 
0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, 9 @@ -366,11 +364,11 @@ entry: define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; SI-LABEL: fshr_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x15 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x15 +; SI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s19, 0xf000 +; SI-NEXT: s_mov_b32 s18, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s11 ; SI-NEXT: v_mov_b32_e32 v1, s15 @@ -384,14 +382,14 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v4, s12 ; SI-NEXT: v_alignbit_b32 v0, s4, v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v4i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: v_mov_b32_e32 v1, s15 @@ -412,10 +410,10 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; GFX9-LABEL: fshr_v4i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 
0x34 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 ; GFX9-NEXT: v_mov_b32_e32 v1, s15 @@ -453,9 +451,9 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-LABEL: fshr_v4i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s15 @@ -466,15 +464,15 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, v1 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, v4 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, v5 -; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] +; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v4i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x54 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[12:15], s[2:3], 0x54 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s15 :: v_dual_mov_b32 v1, s14 @@ -498,10 +496,10 @@ entry: define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) { ; SI-LABEL: fshr_v4i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; 
SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s11 ; SI-NEXT: v_mov_b32_e32 v1, s10 @@ -511,13 +509,13 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; SI-NEXT: v_alignbit_b32 v1, s5, v0, 7 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v4i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: v_mov_b32_e32 v1, s10 @@ -534,9 +532,9 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; ; GFX9-LABEL: fshr_v4i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 ; GFX9-NEXT: v_mov_b32_e32 v1, s10 @@ -568,22 +566,22 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX10-LABEL: fshr_v4i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v3, s7, s11, 1 ; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, 9 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, 7 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, 1 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v4i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v3, s7, s11, 1 diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll index 8779bb0df0f71..8fd201038ad16 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll @@ -3975,15 +3975,15 @@ define float @v_elim_redun_check_ult_sqrt_ulp3(float %in) { define amdgpu_kernel void @elim_redun_check_neg0(ptr addrspace(1) %out, float %in) { ; SDAG-IEEE-LABEL: elim_redun_check_neg0: ; SDAG-IEEE: ; %bb.0: ; %entry -; SDAG-IEEE-NEXT: s_load_dword s2, s[0:1], 0xb -; SDAG-IEEE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SDAG-IEEE-NEXT: s_load_dword s0, s[2:3], 0xb +; SDAG-IEEE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 ; SDAG-IEEE-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s2, v1 -; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, s2 -; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s0, v1 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, s0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 ; SDAG-IEEE-NEXT: s_mov_b32 s6, -1 @@ -4005,18 +4005,17 @@ define 
amdgpu_kernel void @elim_redun_check_neg0(ptr addrspace(1) %out, float %i ; ; GISEL-IEEE-LABEL: elim_redun_check_neg0: ; GISEL-IEEE: ; %bb.0: ; %entry -; GISEL-IEEE-NEXT: s_load_dword s2, s[0:1], 0xb -; GISEL-IEEE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GISEL-IEEE-NEXT: s_load_dword s6, s[2:3], 0xb +; GISEL-IEEE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 +; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 ; GISEL-IEEE-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, s2 -; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, s2, v1 -; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, s6, v1 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 -; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 ; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[0:1], -1, v1 ; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 ; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[0:1], 1, v1 @@ -4032,24 +4031,25 @@ define amdgpu_kernel void @elim_redun_check_neg0(ptr addrspace(1) %out, float %i ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-IEEE-NEXT: v_bfrev_b32_e32 v1, 1 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 ; GISEL-IEEE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GISEL-IEEE-NEXT: s_endpgm ; ; SDAG-DAZ-LABEL: elim_redun_check_neg0: ; SDAG-DAZ: ; %bb.0: ; %entry -; SDAG-DAZ-NEXT: s_load_dword s2, s[0:1], 0xb +; SDAG-DAZ-NEXT: s_load_dword s0, s[2:3], 0xb ; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SDAG-DAZ-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SDAG-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; 
SDAG-DAZ-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 -; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s2 -; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s0, v1 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s0 +; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SDAG-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; SDAG-DAZ-NEXT: s_mov_b32 s2, -1 ; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 ; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 @@ -4063,22 +4063,24 @@ define amdgpu_kernel void @elim_redun_check_neg0(ptr addrspace(1) %out, float %i ; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 ; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 ; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-DAZ-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SDAG-DAZ-NEXT: s_endpgm ; ; GISEL-DAZ-LABEL: elim_redun_check_neg0: ; GISEL-DAZ: ; %bb.0: ; %entry -; GISEL-DAZ-NEXT: s_load_dword s2, s[0:1], 0xb +; GISEL-DAZ-NEXT: s_load_dword s4, s[2:3], 0xb +; GISEL-DAZ-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GISEL-DAZ-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GISEL-DAZ-NEXT: s_mov_b32 s3, 0xf000 +; GISEL-DAZ-NEXT: s_mov_b32 s2, -1 ; GISEL-DAZ-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, s2 -; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 -; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, s4 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, s4, v1 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; GISEL-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 ; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 @@ -4093,9 +4095,8 @@ 
define amdgpu_kernel void @elim_redun_check_neg0(ptr addrspace(1) %out, float %i ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-DAZ-NEXT: v_bfrev_b32_e32 v1, 1 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GISEL-DAZ-NEXT: s_mov_b32 s2, -1 ; GISEL-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GISEL-DAZ-NEXT: s_endpgm entry: @@ -4109,15 +4110,15 @@ entry: define amdgpu_kernel void @elim_redun_check_pos0(ptr addrspace(1) %out, float %in) { ; SDAG-IEEE-LABEL: elim_redun_check_pos0: ; SDAG-IEEE: ; %bb.0: ; %entry -; SDAG-IEEE-NEXT: s_load_dword s2, s[0:1], 0xb -; SDAG-IEEE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SDAG-IEEE-NEXT: s_load_dword s0, s[2:3], 0xb +; SDAG-IEEE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 ; SDAG-IEEE-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s2, v1 -; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, s2 -; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s0, v1 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, s0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 ; SDAG-IEEE-NEXT: s_mov_b32 s6, -1 @@ -4139,18 +4140,17 @@ define amdgpu_kernel void @elim_redun_check_pos0(ptr addrspace(1) %out, float %i ; ; GISEL-IEEE-LABEL: elim_redun_check_pos0: ; GISEL-IEEE: ; %bb.0: ; %entry -; GISEL-IEEE-NEXT: s_load_dword s2, s[0:1], 0xb -; GISEL-IEEE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GISEL-IEEE-NEXT: s_load_dword s6, s[2:3], 0xb +; GISEL-IEEE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 +; GISEL-IEEE-NEXT: 
s_mov_b32 s7, 0xf000 ; GISEL-IEEE-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, s2 -; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, s2, v1 -; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, s6, v1 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 -; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 ; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[0:1], -1, v1 ; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 ; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[0:1], 1, v1 @@ -4165,24 +4165,25 @@ define amdgpu_kernel void @elim_redun_check_pos0(ptr addrspace(1) %out, float %i ; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, s2, 0 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, s6, 0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 ; GISEL-IEEE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GISEL-IEEE-NEXT: s_endpgm ; ; SDAG-DAZ-LABEL: elim_redun_check_pos0: ; SDAG-DAZ: ; %bb.0: ; %entry -; SDAG-DAZ-NEXT: s_load_dword s2, s[0:1], 0xb +; SDAG-DAZ-NEXT: s_load_dword s0, s[2:3], 0xb ; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SDAG-DAZ-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SDAG-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; SDAG-DAZ-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 -; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s2 -; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s0, v1 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s0 +; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SDAG-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; SDAG-DAZ-NEXT: s_mov_b32 s2, 
-1 ; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 ; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 @@ -4196,22 +4197,24 @@ define amdgpu_kernel void @elim_redun_check_pos0(ptr addrspace(1) %out, float %i ; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 ; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 ; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-DAZ-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SDAG-DAZ-NEXT: s_endpgm ; ; GISEL-DAZ-LABEL: elim_redun_check_pos0: ; GISEL-DAZ: ; %bb.0: ; %entry -; GISEL-DAZ-NEXT: s_load_dword s2, s[0:1], 0xb +; GISEL-DAZ-NEXT: s_load_dword s4, s[2:3], 0xb +; GISEL-DAZ-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GISEL-DAZ-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GISEL-DAZ-NEXT: s_mov_b32 s3, 0xf000 +; GISEL-DAZ-NEXT: s_mov_b32 s2, -1 ; GISEL-DAZ-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, s2 -; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 -; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, s4 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, s4, v1 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; GISEL-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 ; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 @@ -4225,9 +4228,8 @@ define amdgpu_kernel void @elim_redun_check_pos0(ptr addrspace(1) %out, float %i ; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GISEL-DAZ-NEXT: v_cmp_lt_f32_e64 vcc, s2, 0 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e64 vcc, s4, 0 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GISEL-DAZ-NEXT: s_mov_b32 s2, -1 ; GISEL-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GISEL-DAZ-NEXT: s_endpgm entry: @@ 
-4241,15 +4243,15 @@ entry: define amdgpu_kernel void @elim_redun_check_ult(ptr addrspace(1) %out, float %in) { ; SDAG-IEEE-LABEL: elim_redun_check_ult: ; SDAG-IEEE: ; %bb.0: ; %entry -; SDAG-IEEE-NEXT: s_load_dword s2, s[0:1], 0xb -; SDAG-IEEE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SDAG-IEEE-NEXT: s_load_dword s0, s[2:3], 0xb +; SDAG-IEEE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 ; SDAG-IEEE-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s2, v1 -; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, s2 -; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s0, v1 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, s0 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 ; SDAG-IEEE-NEXT: s_mov_b32 s6, -1 @@ -4271,18 +4273,17 @@ define amdgpu_kernel void @elim_redun_check_ult(ptr addrspace(1) %out, float %in ; ; GISEL-IEEE-LABEL: elim_redun_check_ult: ; GISEL-IEEE: ; %bb.0: ; %entry -; GISEL-IEEE-NEXT: s_load_dword s2, s[0:1], 0xb -; GISEL-IEEE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GISEL-IEEE-NEXT: s_load_dword s6, s[2:3], 0xb +; GISEL-IEEE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 +; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 ; GISEL-IEEE-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, s2 -; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, s2, v1 -; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, s6, v1 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 -; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 ; GISEL-IEEE-NEXT: v_add_i32_e64 
v2, s[0:1], -1, v1 ; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 ; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[0:1], 1, v1 @@ -4298,24 +4299,25 @@ define amdgpu_kernel void @elim_redun_check_ult(ptr addrspace(1) %out, float %in ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-IEEE-NEXT: v_bfrev_b32_e32 v1, 1 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GISEL-IEEE-NEXT: v_cmp_nge_f32_e32 vcc, s2, v1 +; GISEL-IEEE-NEXT: v_cmp_nge_f32_e32 vcc, s6, v1 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 ; GISEL-IEEE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GISEL-IEEE-NEXT: s_endpgm ; ; SDAG-DAZ-LABEL: elim_redun_check_ult: ; SDAG-DAZ: ; %bb.0: ; %entry -; SDAG-DAZ-NEXT: s_load_dword s2, s[0:1], 0xb +; SDAG-DAZ-NEXT: s_load_dword s0, s[2:3], 0xb ; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SDAG-DAZ-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SDAG-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; SDAG-DAZ-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 -; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s2 -; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s0, v1 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s0 +; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; SDAG-DAZ-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SDAG-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; SDAG-DAZ-NEXT: s_mov_b32 s2, -1 ; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 ; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 @@ -4329,22 +4331,24 @@ define amdgpu_kernel void @elim_redun_check_ult(ptr addrspace(1) %out, float %in ; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 ; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 ; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-DAZ-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SDAG-DAZ-NEXT: s_endpgm ; ; GISEL-DAZ-LABEL: elim_redun_check_ult: ; 
GISEL-DAZ: ; %bb.0: ; %entry -; GISEL-DAZ-NEXT: s_load_dword s2, s[0:1], 0xb +; GISEL-DAZ-NEXT: s_load_dword s4, s[2:3], 0xb +; GISEL-DAZ-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GISEL-DAZ-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GISEL-DAZ-NEXT: s_mov_b32 s3, 0xf000 +; GISEL-DAZ-NEXT: s_mov_b32 s2, -1 ; GISEL-DAZ-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, s2 -; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 -; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, s4 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, s4, v1 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; GISEL-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 ; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 @@ -4359,9 +4363,8 @@ define amdgpu_kernel void @elim_redun_check_ult(ptr addrspace(1) %out, float %in ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-DAZ-NEXT: v_bfrev_b32_e32 v1, 1 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GISEL-DAZ-NEXT: v_cmp_nge_f32_e32 vcc, s2, v1 +; GISEL-DAZ-NEXT: v_cmp_nge_f32_e32 vcc, s4, v1 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GISEL-DAZ-NEXT: s_mov_b32 s2, -1 ; GISEL-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GISEL-DAZ-NEXT: s_endpgm entry: @@ -4375,7 +4378,7 @@ entry: define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float> %in) { ; SDAG-IEEE-LABEL: elim_redun_check_v2: ; SDAG-IEEE: ; %bb.0: ; %entry -; SDAG-IEEE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SDAG-IEEE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 @@ -4423,7 +4426,7 @@ define amdgpu_kernel void @elim_redun_check_v2(ptr 
addrspace(1) %out, <2 x float ; ; GISEL-IEEE-LABEL: elim_redun_check_v2: ; GISEL-IEEE: ; %bb.0: ; %entry -; GISEL-IEEE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GISEL-IEEE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GISEL-IEEE-NEXT: s_waitcnt lgkmcnt(0) @@ -4475,7 +4478,7 @@ define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float ; ; SDAG-DAZ-LABEL: elim_redun_check_v2: ; SDAG-DAZ: ; %bb.0: ; %entry -; SDAG-DAZ-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SDAG-DAZ-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SDAG-DAZ-NEXT: s_mov_b32 s7, 0xf000 @@ -4521,7 +4524,7 @@ define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float ; ; GISEL-DAZ-LABEL: elim_redun_check_v2: ; GISEL-DAZ: ; %bb.0: ; %entry -; GISEL-DAZ-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GISEL-DAZ-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GISEL-DAZ-NEXT: s_waitcnt lgkmcnt(0) @@ -4579,7 +4582,7 @@ entry: define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x float> %in) { ; SDAG-IEEE-LABEL: elim_redun_check_v2_ult: ; SDAG-IEEE: ; %bb.0: ; %entry -; SDAG-IEEE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SDAG-IEEE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 @@ -4627,7 +4630,7 @@ define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x f ; ; GISEL-IEEE-LABEL: elim_redun_check_v2_ult: ; GISEL-IEEE: ; %bb.0: ; %entry -; GISEL-IEEE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GISEL-IEEE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 
0x4f800000 ; GISEL-IEEE-NEXT: s_waitcnt lgkmcnt(0) @@ -4679,7 +4682,7 @@ define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x f ; ; SDAG-DAZ-LABEL: elim_redun_check_v2_ult: ; SDAG-DAZ: ; %bb.0: ; %entry -; SDAG-DAZ-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SDAG-DAZ-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SDAG-DAZ-NEXT: s_mov_b32 s7, 0xf000 @@ -4725,7 +4728,7 @@ define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x f ; ; GISEL-DAZ-LABEL: elim_redun_check_v2_ult: ; GISEL-DAZ: ; %bb.0: ; %entry -; GISEL-DAZ-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GISEL-DAZ-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GISEL-DAZ-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll index f72d4e0e03633..f6df1cbbdd06b 100644 --- a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll @@ -7,58 +7,58 @@ define amdgpu_kernel void @fsub_f16( ; SI-LABEL: fsub_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; SI-NEXT: 
buffer_load_ushort v1, off, s[0:3], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; GFX89-LABEL: fsub_f16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX89-NEXT: s_mov_b32 s3, 0xf000 -; GFX89-NEXT: s_mov_b32 s2, -1 -; GFX89-NEXT: s_mov_b32 s14, s2 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_mov_b32 s11, 0xf000 +; GFX89-NEXT: s_mov_b32 s10, -1 +; GFX89-NEXT: s_mov_b32 s14, s10 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: s_mov_b32 s12, s6 ; GFX89-NEXT: s_mov_b32 s13, s7 -; GFX89-NEXT: s_mov_b32 s15, s3 -; GFX89-NEXT: s_mov_b32 s10, s2 -; GFX89-NEXT: s_mov_b32 s11, s3 +; GFX89-NEXT: s_mov_b32 s15, s11 +; GFX89-NEXT: s_mov_b32 s2, s10 +; GFX89-NEXT: s_mov_b32 s3, s11 ; GFX89-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; GFX89-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_mov_b32 s0, s4 -; GFX89-NEXT: s_mov_b32 s1, s5 +; GFX89-NEXT: s_mov_b32 s8, s4 +; GFX89-NEXT: s_mov_b32 s9, s5 ; GFX89-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX89-NEXT: buffer_store_short v0, off, s[8:11], 0 ; GFX89-NEXT: s_endpgm ; ; GFX11-LABEL: fsub_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 
s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -93,7 +93,7 @@ entry: define amdgpu_kernel void @fsub_f16_imm_a( ; SI-LABEL: fsub_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -113,7 +113,7 @@ define amdgpu_kernel void @fsub_f16_imm_a( ; ; GFX89-LABEL: fsub_f16_imm_a: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -131,7 +131,7 @@ define amdgpu_kernel void @fsub_f16_imm_a( ; ; GFX11-LABEL: fsub_f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -160,7 +160,7 @@ entry: define amdgpu_kernel void @fsub_f16_imm_b( ; SI-LABEL: fsub_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -180,7 +180,7 @@ define amdgpu_kernel void @fsub_f16_imm_b( ; ; GFX89-LABEL: fsub_f16_imm_b: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -198,7 +198,7 @@ define amdgpu_kernel void @fsub_f16_imm_b( ; ; GFX11-LABEL: fsub_f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; 
GFX11-NEXT: s_mov_b32 s10, s6 @@ -227,21 +227,21 @@ entry: define amdgpu_kernel void @fsub_v2f16( ; SI-LABEL: fsub_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -256,60 +256,60 @@ define amdgpu_kernel void @fsub_v2f16( ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fsub_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 
; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_f16_sdwa v2, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_sub_f16_e32 v0, v1, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fsub_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s14, s10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s15, s3 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_mov_b32 s2, s10 +; GFX9-NEXT: s_mov_b32 s3, s11 ; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GFX9-NEXT: s_endpgm ; ; 
GFX11-LABEL: fsub_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -343,7 +343,7 @@ entry: define amdgpu_kernel void @fsub_v2f16_imm_a( ; SI-LABEL: fsub_v2f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -369,7 +369,7 @@ define amdgpu_kernel void @fsub_v2f16_imm_a( ; ; VI-LABEL: fsub_v2f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -390,7 +390,7 @@ define amdgpu_kernel void @fsub_v2f16_imm_a( ; ; GFX9-LABEL: fsub_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -409,7 +409,7 @@ define amdgpu_kernel void @fsub_v2f16_imm_a( ; ; GFX11-LABEL: fsub_v2f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -438,7 +438,7 @@ entry: define amdgpu_kernel void @fsub_v2f16_imm_b( ; SI-LABEL: fsub_v2f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -464,7 +464,7 @@ define amdgpu_kernel void 
@fsub_v2f16_imm_b( ; ; VI-LABEL: fsub_v2f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -485,7 +485,7 @@ define amdgpu_kernel void @fsub_v2f16_imm_b( ; ; GFX9-LABEL: fsub_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -504,7 +504,7 @@ define amdgpu_kernel void @fsub_v2f16_imm_b( ; ; GFX11-LABEL: fsub_v2f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll index 44a9127b4bd09..8846068e750d4 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll @@ -6,7 +6,7 @@ define void @void_func_i1_inreg(i1 inreg %arg0) #0 { ; GFX9-LABEL: void_func_i1_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s4, 1 +; GFX9-NEXT: s_and_b32 s4, s6, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -28,7 +28,7 @@ define void @void_func_i8_inreg(i8 inreg %arg0) #0 { ; GFX9-LABEL: void_func_i8_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -47,7 +47,7 @@ define void @void_func_i16_inreg(i16 inreg %arg0) #0 { ; GFX9-LABEL: void_func_i16_inreg: ; GFX9: ; 
%bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: global_store_short v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -66,7 +66,7 @@ define void @void_func_i32_inreg(i32 inreg %arg0) #0 { ; GFX9-LABEL: void_func_i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -85,8 +85,8 @@ define void @void_func_i64_inreg(i64 inreg %arg0) #0 { ; GFX9-LABEL: void_func_i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -105,7 +105,7 @@ define void @void_func_f16_inreg(half inreg %arg0) #0 { ; GFX9-LABEL: void_func_f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: global_store_short v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -124,7 +124,7 @@ define void @void_func_f32_inreg(float inreg %arg0) #0 { ; GFX9-LABEL: void_func_f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -143,8 +143,8 @@ define void @void_func_f64_inreg(double inreg %arg0) #0 { ; GFX9-LABEL: void_func_f64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; 
GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -163,7 +163,7 @@ define void @void_func_v2i16_inreg(<2 x i16> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -182,9 +182,9 @@ define void @void_func_v3i16_inreg(<3 x i16> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: global_store_short v[0:1], v0, off -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -214,8 +214,8 @@ define void @void_func_v4i16_inreg(<4 x i16> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -234,10 +234,10 @@ define void @void_func_v5i16_inreg(<5 x i16> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v5i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: global_store_short v[0:1], v0, off -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; 
GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -259,10 +259,10 @@ define void @void_func_v8i16_inreg(<8 x i16> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v8i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -282,8 +282,8 @@ define void @void_func_v2i32_inreg(<2 x i32> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -302,9 +302,9 @@ define void @void_func_v3i32_inreg(<3 x i32> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-NEXT: global_store_dwordx3 v[0:1], v[0:2], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -324,10 +324,10 @@ define void @void_func_v4i32_inreg(<4 x i32> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: 
v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -347,12 +347,12 @@ define void @void_func_v5i32_inreg(<5 x i32> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v5i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v0, s18 ; GFX9-NEXT: global_store_dword v[0:1], v0, off -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -360,7 +360,7 @@ define void @void_func_v5i32_inreg(<5 x i32> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v5i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: s_clause 0x1 @@ -375,16 +375,16 @@ define void @void_func_v8i32_inreg(<8 x i32> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v8i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: v_mov_b32_e32 v0, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s20 +; GFX9-NEXT: v_mov_b32_e32 v3, s21 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; 
GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -392,8 +392,8 @@ define void @void_func_v8i32_inreg(<8 x i32> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v8i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 ; GFX11-NEXT: s_clause 0x1 @@ -437,28 +437,28 @@ define void @void_func_v16i32_inreg(<16 x i32> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v16i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, s26 +; GFX9-NEXT: v_mov_b32_e32 v1, s27 +; GFX9-NEXT: v_mov_b32_e32 v2, s28 +; GFX9-NEXT: v_mov_b32_e32 v3, s29 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s22 +; GFX9-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-NEXT: v_mov_b32_e32 v2, s24 +; GFX9-NEXT: v_mov_b32_e32 v3, s25 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: 
v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: v_mov_b32_e32 v0, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s20 +; GFX9-NEXT: v_mov_b32_e32 v3, s21 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -466,12 +466,12 @@ define void @void_func_v16i32_inreg(<16 x i32> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v16i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 -; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 -; GFX11-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9 -; GFX11-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11 -; GFX11-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5 -; GFX11-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v11, s7 +; GFX11-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v1, s23 +; GFX11-NEXT: v_dual_mov_b32 v2, s24 :: v_dual_mov_b32 v3, s25 +; GFX11-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v5, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v7, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v11, s17 ; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1 ; GFX11-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3 ; GFX11-NEXT: s_clause 0x3 @@ -488,47 +488,33 @@ define void @void_func_v32i32_inreg(<32 x i32> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v32i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s28 -; GFX9-NEXT: v_mov_b32_e32 v5, s29 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[12:15], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off -; GFX9-NEXT: v_mov_b32_e32 v0, s24 -; GFX9-NEXT: v_mov_b32_e32 v1, s25 -; GFX9-NEXT: v_mov_b32_e32 v2, s26 -; GFX9-NEXT: v_mov_b32_e32 v3, s27 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s20 -; GFX9-NEXT: v_mov_b32_e32 v1, s21 -; GFX9-NEXT: v_mov_b32_e32 v2, s22 -; GFX9-NEXT: v_mov_b32_e32 v3, s23 +; GFX9-NEXT: v_mov_b32_e32 v0, s26 +; GFX9-NEXT: v_mov_b32_e32 v1, s27 +; GFX9-NEXT: v_mov_b32_e32 v2, s28 +; GFX9-NEXT: v_mov_b32_e32 v3, s29 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, s22 +; GFX9-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-NEXT: v_mov_b32_e32 v2, s24 +; GFX9-NEXT: v_mov_b32_e32 v3, s25 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s20 +; GFX9-NEXT: v_mov_b32_e32 v3, s21 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: 
v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -536,24 +522,18 @@ define void @void_func_v32i32_inreg(<32 x i32> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v32i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v1, s29 -; GFX11-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v5, s25 -; GFX11-NEXT: v_dual_mov_b32 v6, s26 :: v_dual_mov_b32 v7, s27 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off -; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off ; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off -; GFX11-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17 -; GFX11-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19 -; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 -; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 -; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 -; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 -; GFX11-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5 -; GFX11-NEXT: v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v15, s7 +; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off +; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off +; GFX11-NEXT: v_dual_mov_b32 v0, s26 :: v_dual_mov_b32 v1, s27 +; GFX11-NEXT: v_dual_mov_b32 v2, s28 :: v_dual_mov_b32 v3, s29 +; GFX11-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v5, s23 +; GFX11-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v7, s25 +; GFX11-NEXT: v_dual_mov_b32 v8, s18 
:: v_dual_mov_b32 v9, s19 +; GFX11-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v11, s21 +; GFX11-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7 +; GFX11-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v15, s17 ; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 ; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 ; GFX11-NEXT: s_clause 0x4 @@ -571,10 +551,10 @@ define void @void_func_v2i64_inreg(<2 x i64> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -594,13 +574,13 @@ define void @void_func_v3i64_inreg(<3 x i64> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v0, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -608,7 +588,7 @@ define void @void_func_v3i64_inreg(<3 x i64> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v3i64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: 
v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s7 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_clause 0x1 @@ -623,16 +603,16 @@ define void @void_func_v4i64_inreg(<4 x i64> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: v_mov_b32_e32 v0, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s20 +; GFX9-NEXT: v_mov_b32_e32 v3, s21 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -640,8 +620,8 @@ define void @void_func_v4i64_inreg(<4 x i64> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v4i64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 ; GFX11-NEXT: s_clause 0x1 @@ -656,20 +636,20 @@ define void @void_func_v5i64_inreg(<5 x i64> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v5i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 
v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: v_mov_b32_e32 v0, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s20 +; GFX9-NEXT: v_mov_b32_e32 v3, s21 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v0, s22 +; GFX9-NEXT: v_mov_b32_e32 v1, s23 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -677,11 +657,11 @@ define void @void_func_v5i64_inreg(<5 x i64> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v5i64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 -; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v9, s19 ; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off ; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off @@ -695,28 +675,28 @@ define void @void_func_v8i64_inreg(<8 x i64> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v8i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 
v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, s26 +; GFX9-NEXT: v_mov_b32_e32 v1, s27 +; GFX9-NEXT: v_mov_b32_e32 v2, s28 +; GFX9-NEXT: v_mov_b32_e32 v3, s29 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s22 +; GFX9-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-NEXT: v_mov_b32_e32 v2, s24 +; GFX9-NEXT: v_mov_b32_e32 v3, s25 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: v_mov_b32_e32 v0, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s20 +; GFX9-NEXT: v_mov_b32_e32 v3, s21 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -724,12 +704,12 @@ define void @void_func_v8i64_inreg(<8 x i64> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v8i64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 -; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 -; GFX11-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9 -; GFX11-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11 -; GFX11-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5 -; GFX11-NEXT: 
v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v11, s7 +; GFX11-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v1, s23 +; GFX11-NEXT: v_dual_mov_b32 v2, s24 :: v_dual_mov_b32 v3, s25 +; GFX11-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v5, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v7, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v11, s17 ; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1 ; GFX11-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3 ; GFX11-NEXT: s_clause 0x3 @@ -746,47 +726,33 @@ define void @void_func_v16i64_inreg(<16 x i64> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v16i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s28 -; GFX9-NEXT: v_mov_b32_e32 v5, s29 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[12:15], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off -; GFX9-NEXT: v_mov_b32_e32 v0, s24 -; GFX9-NEXT: v_mov_b32_e32 v1, s25 -; GFX9-NEXT: v_mov_b32_e32 v2, s26 -; GFX9-NEXT: v_mov_b32_e32 v3, s27 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s20 -; GFX9-NEXT: v_mov_b32_e32 v1, s21 -; GFX9-NEXT: v_mov_b32_e32 v2, s22 -; GFX9-NEXT: v_mov_b32_e32 v3, s23 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, s26 +; GFX9-NEXT: v_mov_b32_e32 v1, s27 +; GFX9-NEXT: v_mov_b32_e32 v2, s28 +; GFX9-NEXT: v_mov_b32_e32 v3, s29 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 
v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s22 +; GFX9-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-NEXT: v_mov_b32_e32 v2, s24 +; GFX9-NEXT: v_mov_b32_e32 v3, s25 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: v_mov_b32_e32 v0, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s20 +; GFX9-NEXT: v_mov_b32_e32 v3, s21 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -794,24 +760,18 @@ define void @void_func_v16i64_inreg(<16 x i64> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v16i64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v1, s29 -; GFX11-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v5, s25 -; GFX11-NEXT: v_dual_mov_b32 v6, s26 :: v_dual_mov_b32 v7, s27 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off -; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off ; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off -; GFX11-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17 -; GFX11-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19 -; 
GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 -; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 -; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 -; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 -; GFX11-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5 -; GFX11-NEXT: v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v15, s7 +; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off +; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off +; GFX11-NEXT: v_dual_mov_b32 v0, s26 :: v_dual_mov_b32 v1, s27 +; GFX11-NEXT: v_dual_mov_b32 v2, s28 :: v_dual_mov_b32 v3, s29 +; GFX11-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v5, s23 +; GFX11-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v7, s25 +; GFX11-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v9, s19 +; GFX11-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v11, s21 +; GFX11-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7 +; GFX11-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v15, s17 ; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 ; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 ; GFX11-NEXT: s_clause 0x4 @@ -829,7 +789,7 @@ define void @void_func_v2f16_inreg(<2 x half> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -848,9 +808,9 @@ define void @void_func_v3f16_inreg(<3 x half> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: global_store_short v[0:1], v0, off -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; 
GFX9-NEXT: s_setpc_b64 s[30:31] @@ -871,8 +831,8 @@ define void @void_func_v4f16_inreg(<4 x half> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -891,10 +851,10 @@ define void @void_func_v8f16_inreg(<8 x half> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v8f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -914,16 +874,16 @@ define void @void_func_v16f16_inreg(<16 x half> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v16f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: v_mov_b32_e32 v0, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s20 +; GFX9-NEXT: v_mov_b32_e32 v3, s21 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt 
vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -931,8 +891,8 @@ define void @void_func_v16f16_inreg(<16 x half> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v16f16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 ; GFX11-NEXT: s_clause 0x1 @@ -947,8 +907,8 @@ define void @void_func_v2f32_inreg(<2 x float> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -967,9 +927,9 @@ define void @void_func_v3f32_inreg(<3 x float> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-NEXT: global_store_dwordx3 v[0:1], v[0:2], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -989,10 +949,10 @@ define void @void_func_v4f32_inreg(<4 x float> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: 
v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1012,16 +972,16 @@ define void @void_func_v8f32_inreg(<8 x float> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v8f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: v_mov_b32_e32 v0, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s20 +; GFX9-NEXT: v_mov_b32_e32 v3, s21 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1029,8 +989,8 @@ define void @void_func_v8f32_inreg(<8 x float> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v8f32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 ; GFX11-NEXT: s_clause 0x1 @@ -1045,28 +1005,28 @@ define void @void_func_v16f32_inreg(<16 x float> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v16f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 
s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, s26 +; GFX9-NEXT: v_mov_b32_e32 v1, s27 +; GFX9-NEXT: v_mov_b32_e32 v2, s28 +; GFX9-NEXT: v_mov_b32_e32 v3, s29 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s22 +; GFX9-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-NEXT: v_mov_b32_e32 v2, s24 +; GFX9-NEXT: v_mov_b32_e32 v3, s25 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: v_mov_b32_e32 v0, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s20 +; GFX9-NEXT: v_mov_b32_e32 v3, s21 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1074,12 +1034,12 @@ define void @void_func_v16f32_inreg(<16 x float> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v16f32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 -; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 -; GFX11-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9 -; GFX11-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11 -; GFX11-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5 -; GFX11-NEXT: 
v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v11, s7 +; GFX11-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v1, s23 +; GFX11-NEXT: v_dual_mov_b32 v2, s24 :: v_dual_mov_b32 v3, s25 +; GFX11-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v5, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v7, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v11, s17 ; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1 ; GFX11-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3 ; GFX11-NEXT: s_clause 0x3 @@ -1096,10 +1056,10 @@ define void @void_func_v2f64_inreg(<2 x double> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2f64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1119,13 +1079,13 @@ define void @void_func_v3f64_inreg(<3 x double> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3f64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v0, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1133,7 +1093,7 
@@ define void @void_func_v3f64_inreg(<3 x double> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v3f64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s7 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_clause 0x1 @@ -1148,16 +1108,16 @@ define void @void_func_v4f64_inreg(<4 x double> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4f64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: v_mov_b32_e32 v0, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s20 +; GFX9-NEXT: v_mov_b32_e32 v3, s21 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1165,8 +1125,8 @@ define void @void_func_v4f64_inreg(<4 x double> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v4f64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 ; GFX11-NEXT: 
s_clause 0x1 @@ -1181,28 +1141,28 @@ define void @void_func_v8f64_inreg(<8 x double> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v8f64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, s26 +; GFX9-NEXT: v_mov_b32_e32 v1, s27 +; GFX9-NEXT: v_mov_b32_e32 v2, s28 +; GFX9-NEXT: v_mov_b32_e32 v3, s29 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s22 +; GFX9-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-NEXT: v_mov_b32_e32 v2, s24 +; GFX9-NEXT: v_mov_b32_e32 v3, s25 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: v_mov_b32_e32 v0, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s20 +; GFX9-NEXT: v_mov_b32_e32 v3, s21 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1210,12 +1170,12 @@ define void @void_func_v8f64_inreg(<8 x double> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v8f64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 -; GFX11-NEXT: v_dual_mov_b32 v2, 
s14 :: v_dual_mov_b32 v3, s15 -; GFX11-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9 -; GFX11-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11 -; GFX11-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5 -; GFX11-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v11, s7 +; GFX11-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v1, s23 +; GFX11-NEXT: v_dual_mov_b32 v2, s24 :: v_dual_mov_b32 v3, s25 +; GFX11-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v5, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v7, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v11, s17 ; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1 ; GFX11-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3 ; GFX11-NEXT: s_clause 0x3 @@ -1232,47 +1192,33 @@ define void @void_func_v16f64_inreg(<16 x double> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v16f64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s28 -; GFX9-NEXT: v_mov_b32_e32 v5, s29 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[12:15], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off -; GFX9-NEXT: v_mov_b32_e32 v0, s24 -; GFX9-NEXT: v_mov_b32_e32 v1, s25 -; GFX9-NEXT: v_mov_b32_e32 v2, s26 -; GFX9-NEXT: v_mov_b32_e32 v3, s27 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s20 -; GFX9-NEXT: v_mov_b32_e32 v1, s21 -; GFX9-NEXT: v_mov_b32_e32 v2, s22 -; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; 
GFX9-NEXT: v_mov_b32_e32 v0, s26 +; GFX9-NEXT: v_mov_b32_e32 v1, s27 +; GFX9-NEXT: v_mov_b32_e32 v2, s28 +; GFX9-NEXT: v_mov_b32_e32 v3, s29 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s22 +; GFX9-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-NEXT: v_mov_b32_e32 v2, s24 +; GFX9-NEXT: v_mov_b32_e32 v3, s25 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: v_mov_b32_e32 v0, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s20 +; GFX9-NEXT: v_mov_b32_e32 v3, s21 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1280,24 +1226,18 @@ define void @void_func_v16f64_inreg(<16 x double> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v16f64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v1, s29 -; GFX11-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v5, s25 -; GFX11-NEXT: v_dual_mov_b32 v6, s26 :: v_dual_mov_b32 v7, s27 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_store_b128 v[0:1], 
v[0:3], off -; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off ; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off -; GFX11-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17 -; GFX11-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19 -; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 -; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 -; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 -; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 -; GFX11-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5 -; GFX11-NEXT: v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v15, s7 +; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off +; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off +; GFX11-NEXT: v_dual_mov_b32 v0, s26 :: v_dual_mov_b32 v1, s27 +; GFX11-NEXT: v_dual_mov_b32 v2, s28 :: v_dual_mov_b32 v3, s29 +; GFX11-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v5, s23 +; GFX11-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v7, s25 +; GFX11-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v9, s19 +; GFX11-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v11, s21 +; GFX11-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7 +; GFX11-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v15, s17 ; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 ; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 ; GFX11-NEXT: s_clause 0x4 @@ -1315,104 +1255,86 @@ define void @void_func_v32i32_i1_i8_i16_f32_inreg(<32 x i32> inreg %arg0, i1 inr ; GFX9-LABEL: void_func_v32i32_i1_i8_i16_f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v13, v1 -; GFX9-NEXT: v_mov_b32_e32 v12, v0 -; GFX9-NEXT: v_mov_b32_e32 v10, s28 -; GFX9-NEXT: v_mov_b32_e32 v11, s29 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[12:15], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[10:13], off +; GFX9-NEXT: 
global_store_dwordx4 v[0:1], v[8:11], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s24 -; GFX9-NEXT: v_mov_b32_e32 v1, s25 -; GFX9-NEXT: v_mov_b32_e32 v2, s26 -; GFX9-NEXT: v_mov_b32_e32 v3, s27 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s20 -; GFX9-NEXT: v_mov_b32_e32 v1, s21 -; GFX9-NEXT: v_mov_b32_e32 v2, s22 -; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, s26 +; GFX9-NEXT: v_mov_b32_e32 v1, s27 +; GFX9-NEXT: v_mov_b32_e32 v2, s28 +; GFX9-NEXT: v_mov_b32_e32 v3, s29 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s22 +; GFX9-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-NEXT: v_mov_b32_e32 v2, s24 +; GFX9-NEXT: v_mov_b32_e32 v3, s25 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: v_mov_b32_e32 v0, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s20 +; GFX9-NEXT: v_mov_b32_e32 v3, s21 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; 
GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v6 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v16 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_byte v[0:1], v7, off +; GFX9-NEXT: global_store_byte v[0:1], v17, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_short v[0:1], v8, off +; GFX9-NEXT: global_store_short v[0:1], v18, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_short v[0:1], v9, off +; GFX9-NEXT: global_store_short v[0:1], v19, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v32i32_i1_i8_i16_f32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: v_dual_mov_b32 v6, s28 :: v_dual_mov_b32 v7, s29 -; GFX11-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25 -; GFX11-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v13, s27 -; GFX11-NEXT: v_dual_mov_b32 v14, s20 :: v_dual_mov_b32 v15, s21 -; GFX11-NEXT: v_dual_mov_b32 v16, s22 :: v_dual_mov_b32 v17, s23 -; GFX11-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v19, s17 -; GFX11-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v21, s19 -; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off dlc +; GFX11-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v17, s27 +; GFX11-NEXT: v_dual_mov_b32 v18, s28 :: v_dual_mov_b32 v19, s29 +; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[18:21], off dlc +; GFX11-NEXT: 
global_store_b128 v[0:1], v[16:19], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v7, s13 -; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v9, s15 -; GFX11-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX11-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v11, s9 -; GFX11-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v13, s11 -; GFX11-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v15, s5 -; GFX11-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v17, s7 -; GFX11-NEXT: v_dual_mov_b32 v18, s0 :: v_dual_mov_b32 v19, s1 -; GFX11-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v21, s3 -; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off dlc +; GFX11-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v1, s23 +; GFX11-NEXT: v_dual_mov_b32 v2, s24 :: v_dual_mov_b32 v3, s25 +; GFX11-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v5, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v7, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v11, s17 +; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 +; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 +; GFX11-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[18:21], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[16:19], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc +; GFX11-NEXT: global_store_b8 v[0:1], v12, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b8 v[0:1], 
v3, off dlc +; GFX11-NEXT: global_store_b8 v[0:1], v13, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b16 v[0:1], v4, off dlc +; GFX11-NEXT: global_store_b16 v[0:1], v14, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b16 v[0:1], v5, off dlc +; GFX11-NEXT: global_store_b16 v[0:1], v15, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef @@ -1427,94 +1349,76 @@ define void @void_func_v32i32_v2i32_v2f32_inreg(<32 x i32> inreg %arg0, <2 x i32 ; GFX9-LABEL: void_func_v32i32_v2i32_v2f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v13, v1 -; GFX9-NEXT: v_mov_b32_e32 v12, v0 -; GFX9-NEXT: v_mov_b32_e32 v10, s28 -; GFX9-NEXT: v_mov_b32_e32 v11, s29 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[12:15], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[10:13], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s24 -; GFX9-NEXT: v_mov_b32_e32 v1, s25 -; GFX9-NEXT: v_mov_b32_e32 v2, s26 -; GFX9-NEXT: v_mov_b32_e32 v3, s27 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s20 -; GFX9-NEXT: v_mov_b32_e32 v1, s21 -; GFX9-NEXT: v_mov_b32_e32 v2, s22 -; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, s26 +; GFX9-NEXT: v_mov_b32_e32 v1, s27 +; GFX9-NEXT: v_mov_b32_e32 v2, s28 +; GFX9-NEXT: v_mov_b32_e32 v3, s29 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], 
off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s22 +; GFX9-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-NEXT: v_mov_b32_e32 v2, s24 +; GFX9-NEXT: v_mov_b32_e32 v3, s25 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: v_mov_b32_e32 v0, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s20 +; GFX9-NEXT: v_mov_b32_e32 v3, s21 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v[0:1], v[6:7], off +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[16:17], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v[0:1], v[8:9], off +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[18:19], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v32i32_v2i32_v2f32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: v_dual_mov_b32 v6, s28 :: v_dual_mov_b32 v7, s29 -; GFX11-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25 -; GFX11-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v13, s27 -; GFX11-NEXT: v_dual_mov_b32 v14, s20 :: v_dual_mov_b32 v15, s21 -; GFX11-NEXT: v_dual_mov_b32 v16, s22 :: v_dual_mov_b32 v17, s23 -; 
GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v7, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s26 :: v_dual_mov_b32 v1, s27 +; GFX11-NEXT: v_dual_mov_b32 v2, s28 :: v_dual_mov_b32 v3, s29 +; GFX11-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v5, s23 +; GFX11-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v7, s25 ; GFX11-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v9, s19 -; GFX11-NEXT: v_dual_mov_b32 v10, s12 :: v_dual_mov_b32 v11, s13 -; GFX11-NEXT: v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v13, s15 -; GFX11-NEXT: v_dual_mov_b32 v14, s8 :: v_dual_mov_b32 v15, s9 -; GFX11-NEXT: v_dual_mov_b32 v16, s10 :: v_dual_mov_b32 v17, s11 -; GFX11-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 -; GFX11-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 -; GFX11-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v23, s1 -; GFX11-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v25, s3 -; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off dlc +; GFX11-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v11, s21 +; GFX11-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v17, s7 +; GFX11-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v19, s17 +; GFX11-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v21, s1 +; GFX11-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v23, s3 +; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 
-; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[18:21], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[16:19], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[22:25], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[20:23], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off dlc +; GFX11-NEXT: global_store_b64 v[0:1], v[12:13], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b64 v[0:1], v[4:5], off dlc +; GFX11-NEXT: global_store_b64 v[0:1], v[14:15], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef @@ -1527,156 +1431,147 @@ define void @too_many_args_use_workitem_id_x_inreg( ; GFX9-LABEL: too_many_args_use_workitem_id_x_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: v_mov_b32_e32 v16, s6 +; GFX9-NEXT: global_store_dword v[0:1], v16, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, s5 -; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: v_mov_b32_e32 v16, s7 +; GFX9-NEXT: global_store_dword v[0:1], v16, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: v_mov_b32_e32 v16, s16 +; GFX9-NEXT: global_store_dword v[0:1], v16, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: v_mov_b32_e32 v16, s17 +; GFX9-NEXT: global_store_dword v[0:1], v16, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, s8 -; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: v_mov_b32_e32 v16, 
s18 +; GFX9-NEXT: global_store_dword v[0:1], v16, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, s9 -; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: v_mov_b32_e32 v16, s19 +; GFX9-NEXT: global_store_dword v[0:1], v16, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: global_store_dword v[0:1], v16, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: v_mov_b32_e32 v16, s21 +; GFX9-NEXT: global_store_dword v[0:1], v16, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, s12 -; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: v_mov_b32_e32 v16, s22 +; GFX9-NEXT: global_store_dword v[0:1], v16, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, s13 -; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: v_mov_b32_e32 v16, s23 +; GFX9-NEXT: global_store_dword v[0:1], v16, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, s14 -; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: v_mov_b32_e32 v16, s24 +; GFX9-NEXT: global_store_dword v[0:1], v16, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, s15 -; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: v_mov_b32_e32 v16, s25 +; GFX9-NEXT: global_store_dword v[0:1], v16, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, s16 -; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: v_mov_b32_e32 v16, s26 +; GFX9-NEXT: global_store_dword v[0:1], v16, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, s17 -; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: v_mov_b32_e32 v16, s27 +; GFX9-NEXT: global_store_dword v[0:1], v16, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, s18 -; GFX9-NEXT: global_store_dword v[0:1], 
v6, off +; GFX9-NEXT: v_mov_b32_e32 v16, s28 +; GFX9-NEXT: global_store_dword v[0:1], v16, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, s19 -; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: v_mov_b32_e32 v16, s29 +; GFX9-NEXT: global_store_dword v[0:1], v16, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, s20 -; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, s21 -; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: global_store_dword v[0:1], v1, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, s23 -; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: global_store_dword v[0:1], v3, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, s24 -; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: global_store_dword v[0:1], v4, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, s25 -; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: global_store_dword v[0:1], v5, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, s26 ; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, s27 -; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: global_store_dword v[0:1], v7, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, s28 -; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: global_store_dword v[0:1], v8, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, s29 -; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: global_store_dword v[0:1], v9, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v0, off +; 
GFX9-NEXT: global_store_dword v[0:1], v10, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v1, off +; GFX9-NEXT: global_store_dword v[0:1], v11, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: global_store_dword v[0:1], v12, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v3, off +; GFX9-NEXT: global_store_dword v[0:1], v13, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v4, off +; GFX9-NEXT: global_store_dword v[0:1], v14, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v5, off +; GFX9-NEXT: global_store_dword v[0:1], v15, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: too_many_args_use_workitem_id_x_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc +; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1 +; GFX11-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3 +; GFX11-NEXT: v_mov_b32_e32 v16, s6 +; GFX11-NEXT: global_store_b32 v[0:1], v12, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v13, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v14, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v5, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v15, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v6, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v16, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v3, s6 :: 
v_dual_mov_b32 v2, s5 -; GFX11-NEXT: v_dual_mov_b32 v5, s8 :: v_dual_mov_b32 v4, s7 -; GFX11-NEXT: v_mov_b32_e32 v6, s9 -; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc +; GFX11-NEXT: v_dual_mov_b32 v13, s16 :: v_dual_mov_b32 v12, s7 +; GFX11-NEXT: v_dual_mov_b32 v15, s18 :: v_dual_mov_b32 v14, s17 +; GFX11-NEXT: v_mov_b32_e32 v16, s19 +; GFX11-NEXT: global_store_b32 v[0:1], v12, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v13, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v14, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v5, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v15, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v6, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v16, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v5, s13 :: v_dual_mov_b32 v2, s10 -; GFX11-NEXT: v_dual_mov_b32 v3, s11 :: v_dual_mov_b32 v4, s12 -; GFX11-NEXT: v_mov_b32_e32 v6, s14 -; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc +; GFX11-NEXT: v_dual_mov_b32 v15, s23 :: v_dual_mov_b32 v12, s20 +; GFX11-NEXT: v_dual_mov_b32 v13, s21 :: v_dual_mov_b32 v14, s22 +; GFX11-NEXT: v_mov_b32_e32 v16, s24 +; GFX11-NEXT: global_store_b32 v[0:1], v12, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v13, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v14, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v5, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v15, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v6, off dlc +; GFX11-NEXT: 
global_store_b32 v[0:1], v16, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v3, s16 -; GFX11-NEXT: v_dual_mov_b32 v4, s17 :: v_dual_mov_b32 v5, s18 -; GFX11-NEXT: v_mov_b32_e32 v6, s19 -; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc +; GFX11-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v12, s25 +; GFX11-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v14, s27 +; GFX11-NEXT: v_mov_b32_e32 v16, s29 +; GFX11-NEXT: global_store_b32 v[0:1], v12, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v13, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v14, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v5, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v15, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v6, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v16, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v2, s20 :: v_dual_mov_b32 v3, s21 -; GFX11-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v5, s23 -; GFX11-NEXT: v_mov_b32_e32 v6, s24 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc @@ -1687,22 +1582,15 @@ define void @too_many_args_use_workitem_id_x_inreg( ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v6, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v5, s28 :: v_dual_mov_b32 v2, s25 -; GFX11-NEXT: v_dual_mov_b32 v3, s26 :: v_dual_mov_b32 v4, s27 -; GFX11-NEXT: v_mov_b32_e32 v6, 
s29 -; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v7, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v8, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v9, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v5, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v10, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v6, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v11, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] i32 inreg %arg0, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 inreg %arg7, @@ -1755,10 +1643,10 @@ define void @void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inreg %arg ; GFX9-LABEL: void_func_i32_v2float_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: global_store_dword v[0:1], v0, off -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1781,24 +1669,24 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr ; GFX9-LABEL: caller_void_func_i32_v2float_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s7, s33 +; GFX9-NEXT: s_mov_b32 s17, s33 ; GFX9-NEXT: s_mov_b32 s33, 
s32 -; GFX9-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[8:9] -; GFX9-NEXT: s_add_u32 s8, s8, caller_void_func_i32_v2float_inreg@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s9, s9, caller_void_func_i32_v2float_inreg@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s7, 2 +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, caller_void_func_i32_v2float_inreg@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, caller_void_func_i32_v2float_inreg@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[18:19], s[18:19], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s17, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s2, s6 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s2, s16 +; GFX9-NEXT: s_mov_b32 s1, s7 +; GFX9-NEXT: s_mov_b32 s0, s6 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 @@ -1815,19 +1703,19 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s3, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-NEXT: s_or_saveexec_b32 s16, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-NEXT: s_mov_b32 exec_lo, s16 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: s_getpc_b64 s[4:5] -; GFX11-NEXT: s_add_u32 s4, s4, caller_void_func_i32_v2float_inreg@gotpcrel32@lo+4 -; 
GFX11-NEXT: s_addc_u32 s5, s5, caller_void_func_i32_v2float_inreg@gotpcrel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[16:17] +; GFX11-NEXT: s_add_u32 s16, s16, caller_void_func_i32_v2float_inreg@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s17, s17, caller_void_func_i32_v2float_inreg@gotpcrel32@hi+12 ; GFX11-NEXT: v_writelane_b32 v40, s3, 2 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[16:17], s[16:17], 0x0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 @@ -1874,7 +1762,7 @@ define void @void_func_bf16_inreg(bfloat inreg %arg0) #0 { ; GFX9-LABEL: void_func_bf16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: global_store_short v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1893,7 +1781,7 @@ define void @void_func_v2bf16_inreg(<2 x bfloat> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2bf16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1912,9 +1800,9 @@ define void @void_func_v3bf16_inreg(<3 x bfloat> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3bf16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: global_store_short v[0:1], v0, off -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: 
s_setpc_b64 s[30:31] @@ -1935,8 +1823,8 @@ define void @void_func_v4bf16_inreg(<4 x bfloat> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4bf16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1955,10 +1843,10 @@ define void @void_func_v8bf16_inreg(<8 x bfloat> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v8bf16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1978,16 +1866,16 @@ define void @void_func_v16bf16_inreg(<16 x bfloat> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v16bf16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: v_mov_b32_e32 v0, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s20 +; GFX9-NEXT: v_mov_b32_e32 v3, s21 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: 
s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1995,8 +1883,8 @@ define void @void_func_v16bf16_inreg(<16 x bfloat> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v16bf16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 ; GFX11-NEXT: s_clause 0x1 @@ -2011,10 +1899,10 @@ define void @void_func_2_i32_inreg(i32 inreg %arg0, i32 inreg %arg1, ptr addrspa ; GFX9-LABEL: void_func_2_i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2037,10 +1925,10 @@ define void @void_func_2_i64_inreg(i64 inreg %arg0, i64 inreg %arg1, ptr addrspa ; GFX9-LABEL: void_func_2_i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[4:5], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off @@ -2066,13 +1954,13 @@ define void @void_func_i64_inreg_i32_inreg_i64_inreg(i64 inreg %arg0, i32 inreg ; GFX9-LABEL: void_func_i64_inreg_i32_inreg_i64_inreg: ; 
GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[4:5], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, s18 ; GFX9-NEXT: global_store_dword v[0:1], v4, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off @@ -2083,7 +1971,7 @@ define void @void_func_i64_inreg_i32_inreg_i64_inreg(i64 inreg %arg0, i32 inreg ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v5, s1 -; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v3, s4 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v3, s6 ; GFX11-NEXT: v_mov_b32_e32 v6, s2 ; GFX11-NEXT: global_store_b64 v[0:1], v[4:5], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2102,19 +1990,19 @@ define void @void_func_5_i32_inreg(i32 inreg %arg0, i32 inreg %arg1, i32 inreg % ; GFX9-LABEL: void_func_5_i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s17 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: 
v_mov_b32_e32 v2, s18 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2124,7 +2012,7 @@ define void @void_func_5_i32_inreg(i32 inreg %arg0, i32 inreg %arg1, i32 inreg % ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 -; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_mov_b32_e32 v6, s6 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc @@ -2148,12 +2036,12 @@ define void @void_func_a5i32_inreg([5 x i32] inreg %arg0, ptr addrspace(1) %ptr) ; GFX9-LABEL: void_func_a5i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: global_store_dword v[0:1], v2, off offset:16 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2161,7 +2049,7 @@ define void @void_func_a5i32_inreg([5 x i32] inreg %arg0, ptr addrspace(1) %ptr) ; GFX11-LABEL: void_func_a5i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v5, s3 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v5, s3 ; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: s_clause 0x1 @@ -2176,6 +2064,93 @@ define void @void_func_a5i32_inreg([5 x i32] inreg %arg0, ptr addrspace(1) %ptr) declare void @extern() define void @void_func_a13i32_inreg([13 
x i32] inreg %arg0, ptr addrspace(1) %ptr) { +; GFX9-LABEL: void_func_a13i32_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s27, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[28:29], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[28:29] +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: global_store_dword v[0:1], v2, off offset:48 +; GFX9-NEXT: v_mov_b32_e32 v5, s25 +; GFX9-NEXT: v_mov_b32_e32 v4, s24 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:32 +; GFX9-NEXT: v_writelane_b32 v40, s27, 2 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, extern@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, extern@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s4, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_a13i32_inreg: 
+; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s23, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s24, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s24 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v3, s19 +; GFX11-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v9, s17 +; GFX11-NEXT: s_getpc_b64 s[18:19] +; GFX11-NEXT: s_add_u32 s18, s18, extern@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s19, s19, extern@gotpcrel32@hi+12 +; GFX11-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_load_b64 s[16:17], s[18:19], 0x0 +; GFX11-NEXT: v_writelane_b32 v40, s23, 2 +; GFX11-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v5, s21 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v13, s3 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v11, s1 +; GFX11-NEXT: v_mov_b32_e32 v10, s0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b32 v[0:1], v14, off offset:48 +; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off offset:32 +; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] store [13 x i32] %arg0, ptr addrspace(1) %ptr call void @extern() ret void @@ -2203,6 +2178,52 @@ define 
void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p ; FIXME: Should still fail define void @void_func_a16i32_inreg__noimplicit([16 x i32] inreg %arg0, ptr addrspace(1) %ptr) { +; GFX9-LABEL: void_func_a16i32_inreg__noimplicit: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v5, s29 +; GFX9-NEXT: v_mov_b32_e32 v4, s28 +; GFX9-NEXT: v_mov_b32_e32 v3, s27 +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:48 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s25 +; GFX9-NEXT: v_mov_b32_e32 v4, s24 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_a16i32_inreg__noimplicit: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v5, s25 :: v_dual_mov_b32 v4, s24 +; GFX11-NEXT: v_dual_mov_b32 v3, s23 :: v_dual_mov_b32 v2, s22 +; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20 +; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18 +; GFX11-NEXT: v_dual_mov_b32 v13, s17 :: v_dual_mov_b32 v12, s16 +; GFX11-NEXT: v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v10, s6 +; GFX11-NEXT: v_dual_mov_b32 v17, s3 :: v_dual_mov_b32 v16, s2 +; GFX11-NEXT: v_dual_mov_b32 v15, s1 :: v_dual_mov_b32 v14, s0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], 
off offset:48 +; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:32 +; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off offset:16 +; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off +; GFX11-NEXT: s_setpc_b64 s[30:31] store [16 x i32] %arg0, ptr addrspace(1) %ptr ret void } diff --git a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll index 1853aa9303095..2491cc0d19d5a 100644 --- a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll +++ b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @divergent_or3_b32(ptr addrspace(1) %arg) { ; GCN-LABEL: divergent_or3_b32: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] @@ -31,7 +31,7 @@ bb: define amdgpu_kernel void @divergent_or3_b64(ptr addrspace(1) %arg) { ; GCN-LABEL: divergent_or3_b64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] offset:16 @@ -61,7 +61,7 @@ bb: define amdgpu_kernel void @divergent_and3_b32(ptr addrspace(1) %arg) { ; GCN-LABEL: divergent_and3_b32: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] @@ -89,7 +89,7 @@ bb: define amdgpu_kernel void @divergent_and3_b64(ptr addrspace(1) %arg) { ; GCN-LABEL: divergent_and3_b64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], 
v6, s[0:1] @@ -122,7 +122,7 @@ bb: define amdgpu_kernel void @divergent_xor3_b32(ptr addrspace(1) %arg) { ; GCN-LABEL: divergent_xor3_b32: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] @@ -149,7 +149,7 @@ bb: define amdgpu_kernel void @divergent_xor3_b64(ptr addrspace(1) %arg) { ; GCN-LABEL: divergent_xor3_b64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1] @@ -180,7 +180,7 @@ bb: define amdgpu_kernel void @uniform_or3_b32(ptr addrspace(1) %arg) { ; GCN-LABEL: uniform_or3_b32: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -205,7 +205,7 @@ bb: define amdgpu_kernel void @uniform_or3_b64(ptr addrspace(1) %arg) { ; GCN-LABEL: uniform_or3_b64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -232,7 +232,7 @@ bb: define amdgpu_kernel void @uniform_and3_b32(ptr addrspace(1) %arg) { ; GCN-LABEL: uniform_and3_b32: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -257,7 +257,7 @@ bb: define amdgpu_kernel void @uniform_and3_b64(ptr addrspace(1) %arg) { ; GCN-LABEL: uniform_and3_b64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -284,7 +284,7 @@ bb: define amdgpu_kernel void @uniform_xor3_b32(ptr addrspace(1) %arg) { ; GCN-LABEL: uniform_xor3_b32: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -309,7 +309,7 @@ bb: define amdgpu_kernel void @uniform_xor3_b64(ptr addrspace(1) %arg) { ; GCN-LABEL: uniform_xor3_b64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/gds-allocation.ll b/llvm/test/CodeGen/AMDGPU/gds-allocation.ll index 1a9334706cb92..1feae4dae6a09 100644 --- a/llvm/test/CodeGen/AMDGPU/gds-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/gds-allocation.ll @@ -106,7 +106,7 @@ define amdgpu_kernel void @gds_global_align_plus_attr(ptr addrspace(1) %out) #0 define amdgpu_kernel void @gds_extern_align(ptr addrspace(1) %out, ptr addrspace(2) %gds.arg) #0 { ; GCN-LABEL: gds_extern_align: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[0:1], 0x8 +; GCN-NEXT: s_load_dword s0, s[2:3], 0x8 ; GCN-NEXT: v_mov_b32_e32 v0, 5 ; GCN-NEXT: s_movk_i32 m0, 0x401 ; GCN-NEXT: s_movk_i32 s1, 0x400 diff --git a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll index 944dcda5eba6f..d70d45d44af0f 100644 --- a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll @@ -6,12 +6,12 @@ declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr nocapture, double) # define protected amdgpu_kernel void @IllegalGEPConst(i32 %a, 
ptr addrspace(1) %b, double %c) { ; CHECK-LABEL: IllegalGEPConst: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x24 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s3, s2, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; CHECK-NEXT: s_ashr_i32 s1, s0, 31 +; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; CHECK-NEXT: s_add_u32 s0, s4, s0 ; CHECK-NEXT: v_mov_b32_e32 v0, s6 ; CHECK-NEXT: v_mov_b32_e32 v1, s7 diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll index 81239e841e097..0f951e89d37c8 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll @@ -45,7 +45,7 @@ ; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; WORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 15 ; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 0 -define amdgpu_kernel void @minimal_kernel_inputs() { +define amdgpu_kernel void @minimal_kernel_inputs() #0 { %id = call i32 @llvm.amdgcn.workgroup.id.x() store volatile i32 %id, ptr addrspace(1) undef ret void @@ -74,7 +74,7 @@ define amdgpu_kernel void @minimal_kernel_inputs() { ; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; WORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 15 ; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 0 -define amdgpu_kernel void @minimal_kernel_inputs_with_stack() { +define amdgpu_kernel void @minimal_kernel_inputs_with_stack() #0 { %alloca = alloca i32, addrspace(5) %id = call i32 @llvm.amdgcn.workgroup.id.x() store volatile i32 %id, ptr addrspace(1) undef @@ -107,7 +107,7 @@ define amdgpu_kernel void @minimal_kernel_inputs_with_stack() { ; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; WORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 15 ; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 2 -define 
amdgpu_kernel void @queue_ptr() { +define amdgpu_kernel void @queue_ptr() #1 { %queue.ptr = call noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0 %load = load volatile i8, ptr addrspace(4) %queue.ptr %id = call i32 @llvm.amdgcn.workgroup.id.x() @@ -154,7 +154,7 @@ define amdgpu_kernel void @queue_ptr() { ; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; WORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 13 ; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 6 -define amdgpu_kernel void @all_inputs() { +define amdgpu_kernel void @all_inputs() #2 { %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca @@ -182,16 +182,19 @@ define amdgpu_kernel void @all_inputs() { ret void } -declare i32 @llvm.amdgcn.workgroup.id.x() #0 -declare i32 @llvm.amdgcn.workgroup.id.y() #0 -declare i32 @llvm.amdgcn.workgroup.id.z() #0 -declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #0 -declare align 4 ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0 -declare align 4 ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0 -declare align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #0 -declare i64 @llvm.amdgcn.dispatch.id() #0 - -attributes #0 = { nounwind readnone speculatable willreturn } +declare i32 @llvm.amdgcn.workgroup.id.x() #3 +declare i32 @llvm.amdgcn.workgroup.id.y() #3 +declare i32 @llvm.amdgcn.workgroup.id.z() #3 +declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #3 +declare align 4 ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #3 +declare align 4 ptr addrspace(4) @llvm.amdgcn.queue.ptr() #3 +declare align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #3 +declare i64 @llvm.amdgcn.dispatch.id() #3 + +attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } 
+attributes #1 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #2 = { "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll index be6f8a4375163..fb402b5ba30d1 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll @@ -4,29 +4,29 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #1 { ; GCN-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GCN-NEXT: s_mov_b64 s[0:1], exec +; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execz .LBB0_4 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GCN-NEXT: s_bcnt1_i32_b64 s1, s[6:7] -; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s1 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GCN-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GCN-NEXT: s_mov_b64 s[6:7], 0 ; GCN-NEXT: v_mul_f32_e32 v2, 4.0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: 
s_load_dword s0, s[4:5], 0x0 +; GCN-NEXT: s_load_dword s8, s[2:3], 0x0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s8 ; GCN-NEXT: .LBB0_2: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_mov_b32_e32 v5, v1 ; GCN-NEXT: v_add_f32_e32 v4, v5, v2 -; GCN-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[4:5] glc +; GCN-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[2:3] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1 ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v5 @@ -36,7 +36,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrsp ; GCN-NEXT: ; %bb.3: ; %Flow ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: .LBB0_4: ; %Flow2 -; GCN-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_readfirstlane_b32 s0, v1 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GCN-NEXT: v_mov_b32_e32 v1, s0 @@ -52,20 +52,20 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrsp define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #1 { ; GCN-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[2:3], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GCN-NEXT: s_mov_b64 s[0:1], exec +; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execz .LBB1_2 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GCN-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GCN-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) 
-; GCN-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GCN-NEXT: global_atomic_add_f32 v0, v1, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1 ; GCN-NEXT: .LBB1_2: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index e312b37b2e0bb..52fe2342d41a8 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -13,8 +13,8 @@ ; float ; -------------------------------------------------------------------- -define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: +define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -27,7 +27,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 @@ -36,7 +36,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -46,7 +46,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: 
global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off @@ -70,7 +70,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc @@ -78,7 +78,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -100,7 +100,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] @@ -122,7 +122,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -150,7 +150,7 @@ define float 
@global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -179,12 +179,12 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst ret float %result } -define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -197,7 +197,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 @@ -206,7 +206,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: 
+; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -216,7 +216,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -240,7 +240,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc @@ -248,7 +248,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -270,7 +270,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -293,7 +293,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -321,7 +321,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -351,12 +351,12 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst ret float %result } -define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -369,7 +369,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 @@ -378,7 +378,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -388,7 +388,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -412,7 +412,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 glc @@ -420,7 +420,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: 
global_agent_atomic_fadd_ret_f32__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -442,7 +442,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -465,7 +465,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -498,7 +498,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -533,12 +533,12 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst ret float %result } -define void 
@global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -551,7 +551,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 @@ -560,7 +560,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -570,7 +570,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off @@ -593,7 +593,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32: ; 
GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off @@ -601,7 +601,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off @@ -609,7 +609,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] @@ -630,7 +630,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -657,7 +657,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -685,12 +685,12 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 
s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst ret void } -define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -703,7 +703,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 @@ -712,7 +712,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -722,7 +722,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 @@ -745,7 +745,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 @@ -753,7 +753,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 @@ -761,7 +761,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -784,7 +784,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -811,7 +811,7 @@ define void 
@global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -840,12 +840,12 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst ret void } -define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -858,7 +858,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 @@ -867,7 +867,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -877,7 +877,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 @@ -900,7 +900,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 @@ -908,7 +908,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 @@ -916,7 +916,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: 
global_agent_atomic_fadd_noret_f32__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 @@ -939,7 +939,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -970,7 +970,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -1003,12 +1003,12 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst ret void } -define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: 
s_wait_expcnt 0x0 @@ -1037,7 +1037,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc0 sc1 @@ -1046,7 +1046,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -1072,7 +1072,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1096,7 +1096,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1120,7 +1120,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: 
v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1142,7 +1142,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -1165,7 +1165,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1193,7 +1193,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -1223,12 +1223,12 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst, 
!amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst ret float %result } -define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1255,7 +1255,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc0 sc1 @@ -1264,7 +1264,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 @@ -1288,7 +1288,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: 
global_load_dword v4, v[0:1], off offset:2044 @@ -1311,7 +1311,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 @@ -1334,7 +1334,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 @@ -1355,7 +1355,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -1378,7 +1378,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1405,7 +1405,7 @@ define void 
@global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -1434,12 +1434,16 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst ret void } -define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: +; -------------------------------------------------------------------- +; float with ftz/daz +; -------------------------------------------------------------------- + +define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__ftz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1447,34 +1451,34 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__ftz: ; 
GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__ftz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__ftz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1482,7 +1486,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -1495,25 +1499,25 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: +; GFX90A-LABEL: 
global_agent_atomic_fadd_ret_f32__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__ftz: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 @@ -1525,37 +1529,36 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__ftz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: 
v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__ftz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1564,7 +1567,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1576,14 +1579,14 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__ftz: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; 
GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1593,7 +1596,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1605,13 +1608,12 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst ret float %result } -define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: +define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1624,7 +1626,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 @@ -1633,7 +1635,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1643,7 +1645,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1667,7 +1669,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc @@ -1675,7 +1677,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz: ; GFX908: ; %bb.0: 
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1697,7 +1699,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -1720,7 +1722,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1748,7 +1750,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -1778,12 +1780,12 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.ignore.denormal.mode !0 + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst ret float %result } -define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode(ptr 
addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: +define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1791,154 +1793,181 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] 
; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: 
global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, 
s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3 +; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_add_f32_e32 v5, v6, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v5 +; GFX7-NEXT: v_mov_b32_e32 v1, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: 
global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3 +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v0 +; GFX6-NEXT: v_add_f32_e32 v5, v6, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v0, v5 +; GFX6-NEXT: v_mov_b32_e32 v1, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, 
!amdgpu.ignore.denormal.mode !0 - ret void + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + ret float %result } -define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fadd_noret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__ftz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1946,171 +1975,151 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__ftz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__ftz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: 
global_atomic_add_f32 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__ftz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__ftz: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__ftz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: 
s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__ftz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__ftz: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: 
s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret float %result + %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst + ret void } -define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2118,39 +2127,38 @@ define float 
@global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: 
s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc @@ -2158,131 +2166,115 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__ ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off 
offset:2044 glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz: ; 
GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: 
s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 - ret float %result + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + ret void } -define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: +define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2290,210 +2282,234 @@ define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: 
global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: 
global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 -; 
GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, 
vcc +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.ignore.denormal.mode !0 - ret float %result + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + ret void } -define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: +define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start +; 
GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: +; GFX940-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: +; GFX11-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: +; GFX10-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc @@ -2501,147 +2517,207 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: 
global_agent_atomic_fadd_noret_f32_maybe_remote: +; GFX90A-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: +; GFX908-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: 
s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: +; GFX8-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: +; GFX7-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB14_1: 
; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: +; GFX6-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: 
v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst - ret void + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst + ret float %result } -define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: +define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: 
global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 @@ -2664,23 +2740,51 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory( ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -2703,7 +2807,7 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory( ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -2730,7 +2834,7 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory( ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -2759,539 +2863,762 @@ 
define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory( ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst ret void } -define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; -------------------------------------------------------------------- +; double +; -------------------------------------------------------------------- + +define double @global_agent_atomic_fadd_ret_f64(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; 
GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f64: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: 
global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f64: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], 
v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mov_b32_e32 v11, v1 +; GFX7-NEXT: v_mov_b32_e32 v10, v0 +; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v8 +; GFX7-NEXT: v_mov_b32_e32 v1, v9 +; GFX7-NEXT: v_mov_b32_e32 v2, v10 +; GFX7-NEXT: v_mov_b32_e32 v3, v11 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; 
GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_mov_b32_e32 v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, v0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v11, v1 +; GFX6-NEXT: v_mov_b32_e32 v10, v0 +; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v0, v8 +; GFX6-NEXT: v_mov_b32_e32 v1, v9 +; GFX6-NEXT: v_mov_b32_e32 v2, v10 +; GFX6-NEXT: v_mov_b32_e32 v3, v11 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: 
s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 - ret void + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst + ret double %result } -define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: +define double @global_agent_atomic_fadd_ret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: 
global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: +; 
GFX908-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 
v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:2040 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mov_b32_e32 v11, v1 +; GFX7-NEXT: v_mov_b32_e32 v10, v0 +; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v8 +; GFX7-NEXT: v_mov_b32_e32 v1, v9 +; GFX7-NEXT: v_mov_b32_e32 v2, v10 +; GFX7-NEXT: v_mov_b32_e32 v3, v11 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: 
v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_mov_b32_e32 v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, v0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:2040 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v11, v1 +; GFX6-NEXT: v_mov_b32_e32 v10, v0 +; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v0, v8 +; GFX6-NEXT: v_mov_b32_e32 v1, v9 +; GFX6-NEXT: v_mov_b32_e32 v2, v10 +; GFX6-NEXT: v_mov_b32_e32 v3, v11 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: 
s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.ignore.denormal.mode !0 - ret void + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 + %result = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst + ret double %result } -define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: +define double @global_agent_atomic_fadd_ret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: 
v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg: ; GFX10: ; 
%bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; 
GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt 
vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0xfffff800, v6 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_addc_u32_e32 v7, vcc, -1, v7, vcc +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v11, v1 +; GFX7-NEXT: v_mov_b32_e32 v10, v0 +; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v8 +; GFX7-NEXT: v_mov_b32_e32 v1, v9 +; GFX7-NEXT: v_mov_b32_e32 v2, v10 +; GFX7-NEXT: v_mov_b32_e32 v3, v11 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: 
buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: v_mov_b32_e32 v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, v0 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, 0xfffff800, v6 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, -1, v7, vcc +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v11, v1 +; GFX6-NEXT: v_mov_b32_e32 v10, v0 +; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v0, v8 +; GFX6-NEXT: v_mov_b32_e32 v1, v9 +; GFX6-NEXT: v_mov_b32_e32 v2, v10 +; GFX6-NEXT: v_mov_b32_e32 v3, v11 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; 
GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 - ret float %result + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 + %result = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst + ret double %result } -define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: +define void @global_agent_atomic_fadd_noret_f64(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: +; GFX10-LABEL: 
global_agent_atomic_fadd_noret_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 @@ -3299,321 +3626,406 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f64: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f64: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX908-NEXT: 
s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f64: ; 
GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v11, v7 +; GFX7-NEXT: v_mov_b32_e32 v10, v6 +; GFX7-NEXT: v_mov_b32_e32 v9, v5 +; GFX7-NEXT: v_mov_b32_e32 v8, v4 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v6, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: v_mov_b32_e32 v7, v9 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v11, v7 +; GFX6-NEXT: v_mov_b32_e32 v10, v6 +; GFX6-NEXT: v_mov_b32_e32 v9, v5 +; GFX6-NEXT: v_mov_b32_e32 v8, v4 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; GFX6-NEXT: v_mov_b32_e32 v6, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: v_mov_b32_e32 v7, v9 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + %unused = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst ret void } -define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +define void @global_agent_atomic_fadd_noret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB20_1: ; 
%atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:2040 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc +; 
GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB20_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: 
global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:2040 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc +; 
GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v11, v7 +; GFX7-NEXT: v_mov_b32_e32 v10, v6 +; GFX7-NEXT: v_mov_b32_e32 v9, v5 +; GFX7-NEXT: v_mov_b32_e32 v8, v4 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v6, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v7, v9 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v11, v7 +; GFX6-NEXT: v_mov_b32_e32 v10, v6 +; GFX6-NEXT: v_mov_b32_e32 v9, v5 +; GFX6-NEXT: v_mov_b32_e32 v8, v4 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; 
GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; GFX6-NEXT: v_mov_b32_e32 v6, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v7, v9 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 - ret float %result + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 + %unused = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst + ret void } -define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +define void @global_agent_atomic_fadd_noret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv 
scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:-2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB21_1 @@ -3621,994 +4033,1971 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:-2048 ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB21_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], 
v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v11, v7 +; GFX7-NEXT: v_mov_b32_e32 v10, v6 +; GFX7-NEXT: v_mov_b32_e32 v9, v5 +; GFX7-NEXT: v_mov_b32_e32 v8, v4 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; 
GFX7-NEXT: v_mov_b32_e32 v6, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: v_mov_b32_e32 v7, v9 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v11, v7 +; GFX6-NEXT: v_mov_b32_e32 v10, v6 +; GFX6-NEXT: v_mov_b32_e32 v9, v5 +; GFX6-NEXT: v_mov_b32_e32 v8, v4 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; GFX6-NEXT: v_mov_b32_e32 v6, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: 
v_mov_b32_e32 v7, v9 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 + %unused = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst ret void } -define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: +; -------------------------------------------------------------------- +; half +; -------------------------------------------------------------------- + +define half @global_agent_atomic_fadd_ret_f16(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v4, v4 +; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX940-NEXT: global_load_dword v4, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: 
global_atomic_add_f32 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB22_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v4, v4 +; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX10-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: 
s_cbranch_execnz .LBB22_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; 
GFX908-NEXT: v_mov_b32_e32 v3, v0 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 
s[4:5], 0 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 +; GFX7-NEXT: v_not_b32_e32 v7, v2 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; 
GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v3, v4, v7 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 +; GFX6-NEXT: 
v_not_b32_e32 v7, v2 ; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_and_b32_e32 v3, v4, v7 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v3 ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.no.fine.grained.memory !0 - ret float %result + %result = atomicrmw fadd ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst + ret half %result } -define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: +define half @global_agent_atomic_fadd_ret_f16__offset12b_pos(ptr 
addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v4, v4 +; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB23_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_load_dword v4, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB23_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 
0x7fe, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v4, v4 +; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX10-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB23_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, 
v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB23_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 +; GFX7-NEXT: v_not_b32_e32 v8, v2 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: 
v_lshlrev_b32_e32 v2, v6, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 +; GFX6-NEXT: v_not_b32_e32 v8, v2 ; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; 
GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB23_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.no.fine.grained.memory !0 - ret void + %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 + %result = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + ret half %result } -; -------------------------------------------------------------------- -; float with ftz/daz -; -------------------------------------------------------------------- - -define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: +define half @global_agent_atomic_fadd_ret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: 
v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v4, v4 +; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_movk_i32 s0, 0xf800 +; 
GFX940-NEXT: s_mov_b32 s1, -1 +; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_load_dword v4, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB24_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; 
GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v4, v4 +; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX10-NEXT: 
v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX10-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB24_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB24_1: ; 
%atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 +; 
GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB24_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 
; GFX8-NEXT: s_cbranch_execnz .LBB24_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 +; GFX7-NEXT: v_not_b32_e32 v8, v2 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; 
GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 +; GFX6-NEXT: v_not_b32_e32 v8, v2 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB24_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret float %result -} + %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 + %result = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + ret half %result + } -define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fadd_noret_f16(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 
v3, v5, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, 
v[0:1], v2, off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB25_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB25_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: 
v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: v_mov_b32_e32 v3, v0 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 
v5, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: 
v_add_f32_e32 v0, v1, v2 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX7-NEXT: 
v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v4 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB25_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; 
GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX6-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v8, v4 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB25_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret float %result + %unused = atomicrmw fadd ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst + ret void } -define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 
0x7fe, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX940-NEXT: v_lshl_add_u64 v[4:5], 
v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB26_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 glc +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 +; 
GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: 
v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: 
global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, v0 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3 -; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 +; GFX7-NEXT: v_not_b32_e32 v6, v2 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: v_add_f32_e32 v5, v6, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v5 -; GFX7-NEXT: v_mov_b32_e32 v1, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: 
buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 +; GFX6-NEXT: v_not_b32_e32 v6, v2 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v0 -; GFX6-NEXT: v_add_f32_e32 v5, v6, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v5 -; GFX6-NEXT: v_mov_b32_e32 v1, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], 
v[3:4], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret float %result + %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 + %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + ret void } -define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fadd_noret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_movk_i32 s0, 0xf800 +; GFX940-NEXT: s_mov_b32 s1, -1 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: 
global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB27_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; 
GFX10-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4623,31 +6012,91 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB27_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: 
v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4660,264 +6109,451 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 +; GFX7-NEXT: v_not_b32_e32 v6, v2 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], 
v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 +; GFX6-NEXT: v_not_b32_e32 v6, v2 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; 
GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 + %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst ret void } -define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; 
GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 +; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_add_f16_e32 v3, v5, v2 +; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off 
offset:2046 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB28_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 
+; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB28_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 +; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 +; 
GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 +; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB28_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, 
vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f16_e32 v0, v1, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB28_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; 
GFX7-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 
addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void + %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 + %result = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4 + ret half %result } -define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2046 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 +; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_add_f16_e32 v3, v5, v2 +; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB29_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; 
-; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2046 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_add_f16_e32 v3, 
v4, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -4930,33 +6566,65 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 +; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2046 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 +; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB29_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4969,310 +6637,460 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 
s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_1 ; GFX6-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 + %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4 ret void } -define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +define half @global_system_atomic_fadd_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 
+; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_load_dword v4, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: 
v_lshrrev_b32_e32 v4, v3, v7 +; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB30_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB30_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 
v3, v6 +; GFX10-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 
v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX908-NEXT: 
global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 
vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 +; GFX7-NEXT: v_not_b32_e32 v8, v2 ; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; 
GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 +; GFX6-NEXT: v_not_b32_e32 v8, v2 ; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, 
v2 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB30_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 - ret float %result + %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 + %result = atomicrmw fadd ptr addrspace(1) %gep, half %val seq_cst + ret half %result } -define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +define void @global_system_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: 
v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -5285,26 +7103,66 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 
+; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 sc1 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB31_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 
v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -5318,17 +7176,27 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap 
v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -5341,39 +7209,61 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; 
GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 @@ -5385,17 +7275,27 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: 
v_add_u32_e32 v3, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -5408,605 +7308,1461 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 +; GFX7-NEXT: v_not_b32_e32 v6, v2 ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; 
GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: 
v_lshl_b32_e32 v2, 0xffff, v4 +; GFX6-NEXT: v_not_b32_e32 v6, v2 ; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 + %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 + %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val seq_cst ret void } -define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; 
-------------------------------------------------------------------- +; bfloat +; -------------------------------------------------------------------- + +define bfloat @global_agent_atomic_fadd_ret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v4, v4 +; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 +; GFX12-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: 
v_add_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB32_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v4, v4 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; 
GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB32_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off 
offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: v_mov_b32_e32 v3, 
v0 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v5, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB32_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX8-LABEL: 
global_agent_atomic_fadd_ret_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v5 +; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 
s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v3, v4, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, v3 ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, 
s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 ; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v3, v4, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v3 ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; 
GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB32_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 - ret float %result + %result = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst + ret bfloat %result } -define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: 
v_not_b32_e32 v4, v4 +; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB33_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v4, v4 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off 
offset:2044 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, 
v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; 
GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v5, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff +; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; 
GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB33_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v5 +; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 
v8, 0xffff0000, v2 ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: 
v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB33_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 - ret void + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 + %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + ret bfloat %result } -define float 
@global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: +define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v4, v4 +; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: +; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_movk_i32 s0, 0xf800 +; GFX940-NEXT: s_mov_b32 s1, -1 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 +; 
GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB34_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: +; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v4, v4 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: +; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: 
v_and_b32_e32 v3, 3, v3 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB34_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc +; GFX90A-NEXT: v_add_co_u32_e32 
v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: +; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, 
v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v5, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB34_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: +; GFX8-LABEL: 
global_agent_atomic_fadd_ret_bf16__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v5 +; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB34_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; 
GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: +; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 
v0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: +; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 
exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB34_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 - ret float %result -} + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 + %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + ret bfloat %result + } -define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: +define void @global_agent_atomic_fadd_noret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-NEXT: v_add_f32_e32 
v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: +; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: .LBB35_1: ; 
%atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v7, v7, v4, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB35_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: +; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB35_1: ; 
%atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: +; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: 
v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6021,31 +8777,109 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: +; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX908-NEXT: v_mov_b32_e32 v3, v0 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff +; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX908-NEXT: ; 
=>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB35_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: +; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX8-NEXT: v_bfe_u32 v8, 
v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6058,284 +8892,661 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: +; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX7-NEXT: 
v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v4 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB35_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: +; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; 
GFX6-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v8, v4 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB35_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + %unused = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst ret void } -define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe +; 
GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB36_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v5, v5 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; 
GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB36_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: 
global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB36_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB36_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 +; 
GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB36_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; 
GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB36_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 - ret float %result + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 + %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + ret void } -define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; 
-; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_movk_i32 s0, 0xf800 +; GFX940-NEXT: s_mov_b32 s1, -1 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB37_1 +; GFX940-NEXT: ; 
%bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v5, v5 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: 
v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB37_1 @@ -6343,796 +9554,1298 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 
v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: 
v_not_b32_e32 v5, v5 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff +; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB37_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: 
v_lshlrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB37_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 
+; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: 
buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB37_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 + %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst ret void } -; 
-------------------------------------------------------------------- -; double -; -------------------------------------------------------------------- - -define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: +define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], 
v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 +; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX940-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX940-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v4, v4, v3, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX940-NEXT: v_and_or_b32 v4, v5, s3, v3 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; 
GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB38_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc ; GFX11-NEXT: 
s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB38_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 
v[4:5], v[6:7] +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB38_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 +; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc +; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX908-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff +; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 ; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, s7, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB38_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: 
global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc +; GFX8-NEXT: v_or_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB38_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 
-; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v11, v1 -; GFX7-NEXT: v_mov_b32_e32 v10, v0 -; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v8 -; GFX7-NEXT: v_mov_b32_e32 v1, v9 -; GFX7-NEXT: v_mov_b32_e32 v2, v10 -; GFX7-NEXT: v_mov_b32_e32 v3, v11 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_mov_b32_e32 v7, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, v0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v11, v1 -; GFX6-NEXT: v_mov_b32_e32 v10, v0 -; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v8 -; GFX6-NEXT: v_mov_b32_e32 v1, v9 -; GFX6-NEXT: v_mov_b32_e32 v2, v10 -; GFX6-NEXT: v_mov_b32_e32 v3, v11 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB38_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, 
!amdgpu.no.fine.grained.memory !0 - ret double %result + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 + %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4 + ret bfloat %result } -define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; 
GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 +; GFX940-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX940-NEXT: v_add3_u32 v5, v5, v2, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, s3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off 
offset:2046 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB39_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB39_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX10-NEXT: 
global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB39_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 +; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, 
s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff +; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 ; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v5, v5, v2, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX908-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB39_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: 
v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc +; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB39_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: 
global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:2040 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v11, v1 -; GFX7-NEXT: v_mov_b32_e32 v10, v0 -; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v8 -; GFX7-NEXT: v_mov_b32_e32 v1, v9 -; GFX7-NEXT: v_mov_b32_e32 v2, v10 -; GFX7-NEXT: v_mov_b32_e32 v3, v11 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB39_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; 
GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_mov_b32_e32 v7, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, v0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:2040 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v11, v1 -; GFX6-NEXT: v_mov_b32_e32 v10, v0 -; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v8 -; GFX6-NEXT: v_mov_b32_e32 v1, v9 -; GFX6-NEXT: v_mov_b32_e32 v2, v10 -; GFX6-NEXT: v_mov_b32_e32 v3, v11 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB39_1 ; 
GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 - %result = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret double %result + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 + %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4 + ret void } -define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc 
+; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB40_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v4, v4 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB40_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB40_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, 
v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; 
GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v5, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB40_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: 
global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v5 +; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB40_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0xfffff800, v6 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_addc_u32_e32 v7, vcc, -1, v7, vcc -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_not_b32_e32 v7, v4 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v11, v1 -; GFX7-NEXT: v_mov_b32_e32 v10, v0 -; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v8 -; GFX7-NEXT: v_mov_b32_e32 v1, v9 -; GFX7-NEXT: v_mov_b32_e32 v2, v10 -; GFX7-NEXT: v_mov_b32_e32 v3, v11 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc +; 
GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB40_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: v_mov_b32_e32 v7, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, v0 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, 0xfffff800, v6 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, -1, v7, vcc -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: 
v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_not_b32_e32 v7, v4 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v11, v1 -; GFX6-NEXT: v_mov_b32_e32 v10, v0 -; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v8 -; GFX6-NEXT: v_mov_b32_e32 v1, v9 -; GFX6-NEXT: v_mov_b32_e32 v2, v10 -; GFX6-NEXT: v_mov_b32_e32 v3, v11 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB40_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 - %result = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret double %result + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 + %result = atomicrmw 
fadd ptr addrspace(1) %gep, bfloat %val seq_cst + ret bfloat %result } -define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: +define void @global_system_atomic_fadd_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7141,31 +10854,90 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 
0x7fff +; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB41_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: 
v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v5, v5 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7174,23 +10946,38 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: 
global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: 
s_cbranch_execnz .LBB41_1 @@ -7198,1680 +10985,1371 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, 
v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; 
GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB41_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB41_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 +; GFX8-NEXT: 
v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB41_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v11, v7 -; GFX7-NEXT: v_mov_b32_e32 v10, v6 -; GFX7-NEXT: v_mov_b32_e32 v9, v5 -; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; 
GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v7, v9 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB41_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: 
s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v11, v7 -; GFX6-NEXT: v_mov_b32_e32 v10, v6 -; GFX6-NEXT: v_mov_b32_e32 v9, v5 -; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v7, v9 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB41_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 + %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val seq_cst ret void } -define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; -------------------------------------------------------------------- +; <2 x half> +; -------------------------------------------------------------------- + +define <2 x half> 
@global_agent_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB42_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:2040 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB42_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 +; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX10-NEXT: 
global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB42_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:2040 +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB42_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB42_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 ; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v11, v7 -; GFX7-NEXT: v_mov_b32_e32 v10, v6 -; GFX7-NEXT: v_mov_b32_e32 v9, v5 -; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 
s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v7, v9 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 ; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v11, v7 -; GFX6-NEXT: v_mov_b32_e32 v10, v6 -; GFX6-NEXT: v_mov_b32_e32 v9, v5 -; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: 
v_lshlrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v7, v9 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB42_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 - %unused = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst + ret <2 x half> %result } -define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: 
Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB43_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:-2048 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: 
v_pk_add_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB43_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 
exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB43_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:-2048 +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB43_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; 
GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB43_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dwordx2 
v[6:7], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 ; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v11, v7 -; GFX7-NEXT: v_mov_b32_e32 v10, v6 -; GFX7-NEXT: v_mov_b32_e32 v9, v5 -; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 
vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v7, v9 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 ; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v11, v7 -; 
GFX6-NEXT: v_mov_b32_e32 v10, v6 -; GFX6-NEXT: v_mov_b32_e32 v9, v5 -; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v7, v9 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB43_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 - %unused = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + ret <2 x half> %result } -; -------------------------------------------------------------------- -; half -; -------------------------------------------------------------------- - -define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: +define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg(ptr 
addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB44_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: global_load_dword v4, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB44_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; 
GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB44_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: 
global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v5, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB44_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: 
v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v0 -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; 
GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB44_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 
v5, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB44_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v0 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 -; GFX7-NEXT: v_not_b32_e32 v7, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: 
s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v3, v4, v7 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, v3 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_and_b32_e32 v0, 
-4, v3 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 -; GFX6-NEXT: v_not_b32_e32 v7, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v3, v4, v7 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, v3 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: 
v_or_b32_e32 v7, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v0 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB44_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret half %result + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + ret <2 x half> %result } -define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB45_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: 
v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_load_dword v4, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB45_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB45_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v5, v[0:1], off -; GFX10-NEXT: 
v_lshlrev_b32_e32 v3, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB45_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: 
v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB45_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB45_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB45_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; 
GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB45_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 
v0, v6, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret half %result + %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst + ret void } -define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: 
v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB46_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_load_dword v4, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: 
global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB46_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 ; 
GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB46_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v5, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; 
GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB46_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 -; 
GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB46_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: 
v_add_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB46_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 
v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap 
v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt 
vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB46_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 - %result = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret half %result - } + %gep = getelementptr <2 x half>, ptr 
addrspace(1) %ptr, i64 511 + %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + ret void +} -define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB47_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB47_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: ; 
GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -8885,26 +12363,17 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; 
GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -8917,88 +12386,35 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX90A-NEXT: 
global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v0 -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB47_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: 
global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -9011,10205 +12427,116 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v0 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: 
v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_not_b32_e32 v6, v3 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: 
v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_not_b32_e32 v6, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX6-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; 
GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB47_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void -} - -define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX12-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB48_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; 
GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB48_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB48_1 -; GFX11-NEXT: ; 
%bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB48_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; 
GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; 
GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB48_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB48_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 
0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB48_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: 
v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB48_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void -} - -define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB49_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: 
s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB49_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, 
v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB49_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; 
GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB49_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: 
global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB49_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, 
v[0:1], v[3:4] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB49_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB49_1 -; 
GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB49_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 - %unused = atomicrmw fadd ptr addrspace(1) %gep, 
half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void -} - -define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB50_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB50_1: ; 
%atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_add_f16_e32 v3, v5, v2 -; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB50_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB50_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: 
v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB50_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; 
GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB50_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f16_e32 v0, v1, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc -; 
GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB50_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB50_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; 
GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB50_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 - ret half %result -} - -define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2046 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB51_1: ; 
%atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB51_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_add_f16_e32 v3, v5, v2 -; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB51_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: 
global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2046 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB51_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2046 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, 
v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB51_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2046 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB51_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB51_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) 
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB51_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: 
s_cbranch_execnz .LBB51_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 - ret void -} - -define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: 
global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB52_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_load_dword v4, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB52_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 
v0, v3, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB52_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: 
global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v5, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB52_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; 
GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX908-NEXT: global_atomic_cmpswap v4, 
v[0:1], v[6:7], off glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB52_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB52_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB52_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: 
s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB52_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fadd ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 - ret half %result -} - -define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: 
s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB53_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; 
GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB53_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB53_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB53_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: 
global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB53_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start -; 
GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB53_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: 
buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB53_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: 
s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB53_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void -} - -; -------------------------------------------------------------------- -; bfloat -; -------------------------------------------------------------------- - -define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; 
GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB54_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; 
GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB54_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB54_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v5, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_lshrrev_b32_sdwa 
v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB54_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: 
v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v0 -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v5, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc 
-; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB54_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc -; 
GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB54_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v0 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v6, v3 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v3, v4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, v3 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB54_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; 
GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v6, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, v4, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, v3 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB54_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, 
!amdgpu.no.fine.grained.memory !0 - ret bfloat %result -} - -define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: 
v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB55_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: 
v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB55_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: 
v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB55_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v5, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB55_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; 
GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v5, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v5, 
s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB55_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_lshlrev_b32_sdwa 
v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB55_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v7, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, 
v3 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB55_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v7, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz 
.LBB55_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret bfloat %result -} - -define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 
v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB56_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; 
GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB56_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: 
Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB56_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 -; 
GFX10-NEXT: global_load_dword v5, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB56_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 -; 
GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v5, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff 
-; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB56_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_sdwa 
v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB56_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v7, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 
v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB56_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v7, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX6-NEXT: 
v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB56_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 - %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret bfloat %result - } - -define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB57_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: 
.LBB57_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v7, v7, v4, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB57_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt 
vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB57_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB57_1: ; 
%atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB57_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v0 -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; 
GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB57_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] 
glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB57_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v0 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v6, v3 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB57_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; 
GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v6, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB57_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void -} - -define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) 
%ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: 
s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB58_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 -; GFX940-NEXT: 
s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB58_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB58_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 -; 
GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB58_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; 
GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; 
GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB58_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB58_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_setpc_b64 s[30:31] 
-; -; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v5, v5 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB58_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 
v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v5, v5 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB58_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB58_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void -} - -define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: 
s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB59_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], 
vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB59_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, 
v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB59_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc -; 
GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB59_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB59_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB59_1 -; 
GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB59_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: 
global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v5, v5 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB59_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 
-1, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v5, v5 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB59_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB59_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 - %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void -} - -define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX12-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB60_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off 
offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX940-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX940-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v4, v4, v3, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_and_or_b32 v4, v5, s3, v3 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB60_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB60_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, 
v4, v3 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB60_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc -; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: s_setpc_b64 
s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX908-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, s7, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB60_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: 
v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc -; GFX8-NEXT: v_or_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB60_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: 
v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB60_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB60_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB60_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 
4, !amdgpu.no.fine.grained.memory !0 - ret bfloat %result -} - -define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB61_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; 
GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v5, v5, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, s3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB61_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: 
v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB61_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, 
v6, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB61_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v5, v5, v2, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, s7, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB61_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc -; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB61_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; 
GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB61_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX6-NEXT: .LBB61_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB61_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 - ret 
void -} - -define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB62_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB62_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB62_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: 
v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB62_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB62_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; 
GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB62_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v5, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB62_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: 
v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB62_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB62_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, 
v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB62_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v5, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB62_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; 
GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB62_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB62_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_and_b32_e32 v7, v6, 
v4 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB62_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v7, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB62_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; 
GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB62_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v7, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB62_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] 
-; GFX6-NEXT: s_cbranch_execnz .LBB62_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 - ret bfloat %result -} - -define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB63_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; 
GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB63_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB63_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; 
GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB63_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB63_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB63_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB63_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This 
Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB63_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB63_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, 
v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB63_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB63_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v2, 
v2, v6 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB63_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB63_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: 
v_cndmask_b32_e32 v2, v8, v9, vcc -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB63_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v5, v5 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB63_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc 
-; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB63_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v5, v5 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB63_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: 
s_cbranch_execnz .LBB63_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void -} - -; -------------------------------------------------------------------- -; <2 x half> -; -------------------------------------------------------------------- - -define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB64_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v3, 
v4, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB64_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB64_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB64_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: -; 
GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB64_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB64_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB64_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB64_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB64_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB64_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 -; 
GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB64_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB64_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret <2 x half> %result -} - -define <2 x half> 
@global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB65_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz 
.LBB65_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB65_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB65_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB65_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; 
GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB65_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB65_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB65_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; 
GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB65_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB65_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: 
v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB65_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB65_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret <2 x half> %result -} - -define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr 
addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB66_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB66_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB66_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB66_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB66_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], 
v[3:4], off offset:-2048 glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB66_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB66_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB66_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 -; 
GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: .LBB66_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB66_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 
0xfffff800, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: .LBB66_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB66_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret <2 x half> %result -} - -define void 
@global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB67_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB67_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: 
global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB67_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB67_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB67_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB67_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB67_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt 
vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB67_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB67_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; 
GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB67_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void -} - -define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB68_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_f16 
v3, v4, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB68_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB68_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB68_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: 
global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB68_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB68_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; 
GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB68_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB68_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 -; 
GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB68_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB68_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void -} - -define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: 
s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB69_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB69_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB69_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB69_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB69_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: 
flat_atomic_cmpswap v3, v[0:1], v[3:4] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB69_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB69_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 
0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB69_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB69_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; 
GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB69_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void -} - -define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: 
global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB70_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB70_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB70_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB70_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB70_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB70_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB70_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 
v1, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB70_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB70_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], 
v[0:1], s[4:7], 0 addr64 offset:2044 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB70_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB70_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: 
buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB70_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 - ret <2 x half> %result -} - -define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: 
global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB71_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB71_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB71_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB71_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_setpc_b64 
s[30:31] -; -; GFX90A-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB71_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB71_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB71_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_or_b32_e32 
v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB71_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB71_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB71_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB71_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: 
v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB71_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void -} - -define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB72_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB72_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB72_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB72_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: -; GFX908: ; %bb.0: -; 
GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB72_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB72_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB72_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB72_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: 
s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB72_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB72_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: 
s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB72_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB72_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 - ret <2 x half> %result -} - -define void 
@global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB73_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB73_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: 
global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB73_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB73_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB73_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB73_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB73_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: 
buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB73_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB73_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; 
GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB73_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 - ret void -} - -define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB74_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB74_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB74_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB74_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; 
GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB74_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB74_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB74_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB74_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v3 
-; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB74_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB74_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-NEXT: s_setpc_b64 s[30:31] 
-; -; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB74_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB74_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) 
-; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 - ret <2 x half> %result -} - -define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB75_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: 
s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB75_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB75_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB75_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: 
global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB75_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB75_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, 
v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB75_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB75_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: 
v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB75_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 - ret void -} - -define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspace(1) %ptr, <2 x half> %val) { -; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; 
GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB76_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB76_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB76_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB76_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: 
s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB76_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB76_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX8-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB76_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB76_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB76_1 -; 
GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB76_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB76_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst - ret <2 x half> %result -} - -define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) %ptr, <2 x half> %val) { -; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB77_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB77_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB77_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB77_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB77_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB77_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB77_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB77_1: ; 
%atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB77_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB77_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 
-; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB77_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst - ret void -} - -; -------------------------------------------------------------------- -; <2 x bfloat> -; -------------------------------------------------------------------- - -define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; 
GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB78_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: 
v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB78_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB78_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: 
buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB78_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB78_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB78_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB78_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB78_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB78_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 -; 
GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB78_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB78_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 
0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB78_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB78_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret <2 x bfloat> %result -} - -define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: 
global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB79_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; 
GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB79_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB79_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: 
v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB79_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB79_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: 
global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB79_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB79_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; 
GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB79_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB79_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, 
s[6:7] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB79_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB79_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: 
global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB79_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB79_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, 
i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret <2 x bfloat> %result -} - -define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB80_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: 
v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB80_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB80_1: ; 
%atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB80_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB80_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: 
v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB80_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB80_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 
v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB80_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB80_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: 
v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB80_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: .LBB80_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, 
v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB80_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: .LBB80_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_alignbit_b32 v1, v1, 
v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB80_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret <2 x bfloat> %result -} - -define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: 
global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB81_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB81_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB81_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB81_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, 
v[0:1], off -; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB81_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB81_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB81_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop 
Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB81_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB81_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: 
v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB81_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB81_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB81_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 
v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB81_1 +; GFX6-NEXT: s_cbranch_execnz .LBB47_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 + %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst ret void } -define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: 
global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -19217,227 +12544,128 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; 
GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB82_1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz 
.LBB48_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; 
GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB82_1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB48_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB82_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 
glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB82_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; 
GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB82_1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB48_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, 
v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB82_1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB48_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -19445,42 +12673,48 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; 
GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; 
GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB82_1 +; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -19488,49 +12722,55 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB82_1 +; GFX6-NEXT: s_cbranch_execnz .LBB48_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 + %result = 
atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst + ret <2 x half> %result } -define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -19538,328 +12778,229 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off 
offset:-2048 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: 
s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB83_1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB49_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; 
GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB83_1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB49_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB83_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: 
v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB83_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, 
v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB83_1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB83_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 
0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB83_1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: 
v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB83_1 +; GFX7-NEXT: s_cbranch_execnz .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; 
GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; 
GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB83_1 +; GFX6-NEXT: s_cbranch_execnz .LBB49_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst ret void } -define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; -------------------------------------------------------------------- +; <2 x bfloat> +; -------------------------------------------------------------------- + +define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -19867,30 +13008,30 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 
+; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 sc0 sc1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB84_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -19915,7 +13056,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -19923,21 +13064,21 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB84_1 +; GFX11-NEXT: 
s_cbranch_execnz .LBB50_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB84_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -19957,29 +13098,29 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 ; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB84_1 +; GFX10-NEXT: s_cbranch_execnz .LBB50_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, 
v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB84_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -19998,30 +13139,28 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB84_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB84_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB50_1: ; 
%atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -20040,68 +13179,67 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB84_1 +; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB84_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX8-NEXT: 
v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB84_1 +; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -20110,7 +13248,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB84_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -20124,7 +13262,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v6, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 @@ -20132,21 +13270,21 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB84_1 +; GFX7-NEXT: s_cbranch_execnz .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: 
v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 @@ -20155,7 +13293,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB84_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -20170,7 +13308,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v6, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 @@ -20178,20 +13316,19 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB84_1 +; GFX6-NEXT: s_cbranch_execnz .LBB50_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst ret <2 x bfloat> %result } -define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) 
#0 { -; GFX12-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -20199,229 +13336,232 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 sc1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: 
s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB85_1 +; GFX11-NEXT: s_cbranch_execnz .LBB51_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: 
v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB85_1 +; GFX10-NEXT: s_cbranch_execnz .LBB51_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: 
global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 
vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB85_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, 
v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB85_1 +; GFX908-NEXT: s_cbranch_execnz .LBB51_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, 
v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, 
v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB85_1 +; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -20430,41 +13570,43 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, 
v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB85_1 +; GFX7-NEXT: s_cbranch_execnz .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -20473,48 +13615,50 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 
addr64 offset:2044 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB85_1 +; GFX6-NEXT: s_cbranch_execnz .LBB51_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + ret <2 x bfloat> %result } -define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: +define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -20522,30 +13666,30 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: ; 
GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -20570,7 +13714,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -20578,21 +13722,21 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB86_1 +; GFX11-NEXT: s_cbranch_execnz .LBB52_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, 
exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -20612,29 +13756,29 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 ; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB86_1 +; GFX10-NEXT: s_cbranch_execnz .LBB52_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; 
GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -20653,28 +13797,28 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB86_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -20693,156 +13837,162 @@ define <2 x bfloat> 
@global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB86_1 +; GFX908-NEXT: s_cbranch_execnz .LBB52_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; 
GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB86_1 +; GFX8-NEXT: s_cbranch_execnz .LBB52_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX7-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 
0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB86_1 +; GFX7-NEXT: s_cbranch_execnz .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 
1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB86_1 +; GFX6-NEXT: s_cbranch_execnz .LBB52_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + %gep = getelementptr <2 x 
bfloat>, ptr addrspace(1) %ptr, i64 -512 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst ret <2 x bfloat> %result } -define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: +define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -20855,7 +14005,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 @@ -20864,7 +14014,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off @@ -20873,7 +14023,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -20905,20 +14055,20 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX11-NEXT: 
s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB87_1 +; GFX11-NEXT: s_cbranch_execnz .LBB53_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -20945,12 +14095,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB87_1 +; GFX10-NEXT: s_cbranch_execnz .LBB53_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -20959,7 +14109,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -20984,12 +14134,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB87_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -20998,7 +14148,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -21023,19 +14173,19 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB87_1 +; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -21063,12 +14213,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB87_1 +; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -21084,7 +14234,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -21106,12 +14256,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB87_1 +; GFX7-NEXT: s_cbranch_execnz .LBB53_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: 
s_mov_b32 s6, 0 @@ -21127,7 +14277,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -21150,17 +14300,17 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB87_1 +; GFX6-NEXT: s_cbranch_execnz .LBB53_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst ret void } -define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -21168,327 +14318,320 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN -; GFX12-NEXT: 
s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, 
v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB88_1 +; GFX11-NEXT: s_cbranch_execnz .LBB54_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 
v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB88_1 +; GFX10-NEXT: s_cbranch_execnz .LBB54_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; 
GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB88_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 
s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 
v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB88_1 +; GFX908-NEXT: s_cbranch_execnz .LBB54_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: 
v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB88_1 +; GFX8-NEXT: s_cbranch_execnz .LBB54_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: 
s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: 
buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB88_1 +; GFX7-NEXT: s_cbranch_execnz .LBB54_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: 
v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB88_1 +; GFX6-NEXT: s_cbranch_execnz .LBB54_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 - 
ret <2 x bfloat> %result + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + ret void } -define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -21496,30 +14639,30 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off +; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], 
off +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -21542,7 +14685,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -21551,20 +14694,20 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB89_1 +; GFX11-NEXT: s_cbranch_execnz .LBB55_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB55_1: ; 
%atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -21583,7 +14726,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 ; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -21591,21 +14734,21 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB89_1 +; GFX10-NEXT: s_cbranch_execnz .LBB55_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -21623,28 +14766,28 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] ; 
GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB89_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -21662,26 +14805,28 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 
s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB89_1 +; GFX908-NEXT: s_cbranch_execnz .LBB55_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -21709,28 +14854,32 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB89_1 +; GFX8-NEXT: s_cbranch_execnz .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: 
v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -21752,28 +14901,32 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB89_1 +; GFX7-NEXT: s_cbranch_execnz .LBB55_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX6-NEXT: 
s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -21796,17 +14949,18 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB89_1 +; GFX6-NEXT: s_cbranch_execnz .LBB55_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 + %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst ret void } -define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrspace(1) %ptr, <2 x bfloat> %val) { -; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: +define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -21814,30 +14968,30 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN ; GFX12-NEXT: 
s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: +; GFX940-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: +; GFX11-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -21862,7 +15016,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -21870,21 +15024,21 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr 
addrs ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB90_1 +; GFX11-NEXT: s_cbranch_execnz .LBB56_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: +; GFX10-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -21904,29 +15058,29 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 ; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB90_1 +; GFX10-NEXT: s_cbranch_execnz .LBB56_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: +; GFX90A-LABEL: 
global_system_atomic_fadd_ret_v2bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -21945,28 +15099,30 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB90_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: +; GFX908-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -21985,67 +15141,68 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB90_1 +; GFX908-NEXT: s_cbranch_execnz .LBB56_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: +; GFX8-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX8-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB90_1 +; GFX8-NEXT: s_cbranch_execnz .LBB56_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: +; GFX7-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; 
GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -22054,7 +15211,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -22068,7 +15225,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v6, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 @@ -22076,21 +15233,21 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB90_1 +; GFX7-NEXT: s_cbranch_execnz .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: +; GFX6-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; 
GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 @@ -22099,7 +15256,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -22114,7 +15271,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v6, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 @@ -22122,19 +15279,20 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB90_1 +; GFX6-NEXT: s_cbranch_execnz .LBB56_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst ret <2 x bfloat> %result } -define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1) %ptr, <2 x 
bfloat> %val) { -; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: +define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -22142,30 +15300,30 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: +; GFX940-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: +; GFX11-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start ; 
GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -22188,7 +15346,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -22197,20 +15355,20 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB91_1 +; GFX11-NEXT: s_cbranch_execnz .LBB57_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: +; GFX10-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -22229,7 +15387,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 ; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -22237,21 +15395,21 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB91_1 +; GFX10-NEXT: s_cbranch_execnz .LBB57_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: +; GFX90A-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -22269,28 +15427,30 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 
s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB91_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: +; GFX908-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -22308,26 +15468,28 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB91_1 +; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: +; GFX8-LABEL: 
global_system_atomic_fadd_noret_v2bf16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -22355,19 +15517,19 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB91_1 +; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: +; GFX7-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -22376,7 +15538,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; 
GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -22390,7 +15552,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 @@ -22398,19 +15560,19 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB91_1 +; GFX7-NEXT: s_cbranch_execnz .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: +; GFX6-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 @@ -22419,7 +15581,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -22434,7 +15596,7 @@ define void 
@global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 @@ -22442,12 +15604,13 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB91_1 +; GFX6-NEXT: s_cbranch_execnz .LBB57_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst ret void } @@ -22458,162 +15621,162 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX12-LABEL: infer_as_before_atomic: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_mov_b32 s2, exec_lo -; GFX12-NEXT: s_mov_b32 s3, exec_lo -; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12-NEXT: s_cbranch_execz .LBB92_2 +; GFX12-NEXT: s_cbranch_execz .LBB58_2 ; GFX12-NEXT: ; %bb.1: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX12-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0 ; 
GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v1, s[0:1] -; GFX12-NEXT: .LBB92_2: +; GFX12-NEXT: global_atomic_add_f32 v0, v1, s[2:3] +; GFX12-NEXT: .LBB58_2: ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm ; ; GFX940-LABEL: infer_as_before_atomic: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: s_mov_b64 s[0:1], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB92_2 +; GFX940-NEXT: s_cbranch_execz .LBB58_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f32 v0, v1, s[0:1] -; GFX940-NEXT: .LBB92_2: +; GFX940-NEXT: global_atomic_add_f32 v0, v1, s[2:3] +; GFX940-NEXT: .LBB58_2: ; GFX940-NEXT: s_endpgm ; ; GFX11-LABEL: infer_as_before_atomic: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_mov_b32 s3, exec_lo -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB92_2 +; GFX11-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-NEXT: ; %bb.1: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1] -; GFX11-NEXT: .LBB92_2: +; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[2:3] +; GFX11-NEXT: .LBB58_2: ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX10-LABEL: infer_as_before_atomic: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s3, exec_lo -; GFX10-NEXT: s_mov_b32 s2, 0 -; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX10-NEXT: s_mov_b32 s5, exec_lo +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB92_3 +; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB58_3 ; GFX10-NEXT: ; %bb.1: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_bcnt1_i32_b32 s3, s5 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: .LBB92_2: ; %atomicrmw.start +; GFX10-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX10-NEXT: .LBB58_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX10-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: s_cbranch_execnz .LBB92_2 -; GFX10-NEXT: .LBB92_3: +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB58_2 +; GFX10-NEXT: .LBB58_3: ; GFX10-NEXT: s_endpgm ; ; GFX90A-LABEL: infer_as_before_atomic: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_mov_b64 s[2:3], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: s_mov_b64 s[0:1], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB92_2 +; GFX90A-NEXT: s_cbranch_execz .LBB58_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v0, v1, s[0:1] -; GFX90A-NEXT: .LBB92_2: +; GFX90A-NEXT: global_atomic_add_f32 v0, v1, s[2:3] +; GFX90A-NEXT: .LBB58_2: ; GFX90A-NEXT: s_endpgm ; ; GFX908-LABEL: infer_as_before_atomic: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_mov_b64 s[2:3], exec -; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; 
GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX908-NEXT: s_mov_b64 s[0:1], exec +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX908-NEXT: s_cbranch_execz .LBB92_2 +; GFX908-NEXT: s_cbranch_execz .LBB58_2 ; GFX908-NEXT: ; %bb.1: -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX908-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX908-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX908-NEXT: v_mov_b32_e32 v0, 0 -; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX908-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v0, v1, s[0:1] -; GFX908-NEXT: .LBB92_2: +; GFX908-NEXT: global_atomic_add_f32 v0, v1, s[2:3] +; GFX908-NEXT: .LBB58_2: ; GFX908-NEXT: s_endpgm ; ; GFX8-LABEL: infer_as_before_atomic: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB92_3 +; GFX8-NEXT: s_cbranch_execz .LBB58_3 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v4, s2 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX8-NEXT: s_bcnt1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v4, s5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX8-NEXT: 
v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: .LBB92_2: ; %atomicrmw.start +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: .LBB58_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_add_f32_e32 v2, v3, v4 ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -22622,32 +15785,32 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_cbranch_execnz .LBB92_2 -; GFX8-NEXT: .LBB92_3: +; GFX8-NEXT: s_cbranch_execnz .LBB58_2 +; GFX8-NEXT: .LBB58_3: ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: infer_as_before_atomic: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b64 s[2:3], exec -; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7-NEXT: s_mov_b64 s[4:5], exec +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7-NEXT: s_cbranch_execz .LBB92_3 +; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7-NEXT: s_cbranch_execz .LBB58_3 ; GFX7-NEXT: ; %bb.1: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX7-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX7-NEXT: s_bcnt1_i32_b64 s6, s[4:5] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: .LBB92_2: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: .LBB58_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7-NEXT: v_mov_b32_e32 v4, v1 @@ -22658,32 +15821,32 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB92_2 -; GFX7-NEXT: .LBB92_3: +; GFX7-NEXT: s_cbranch_execnz .LBB58_2 +; GFX7-NEXT: .LBB58_3: ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: infer_as_before_atomic: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b64 s[2:3], exec -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX6-NEXT: s_cbranch_execz .LBB92_3 +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_cbranch_execz .LBB58_3 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_bcnt1_i32_b64 s6, s[4:5] ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: 
.LBB92_2: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: .LBB58_2: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -22695,15 +15858,13 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB92_2 -; GFX6-NEXT: .LBB92_3: +; GFX6-NEXT: s_cbranch_execnz .LBB58_2 +; GFX6-NEXT: .LBB58_3: ; GFX6-NEXT: s_endpgm %load = load ptr, ptr addrspace(4) %arg - %v = atomicrmw fadd ptr %load, float 1.0 syncscope("agent-one-as") monotonic, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + %v = atomicrmw fadd ptr %load, float 1.0 syncscope("agent-one-as") monotonic, align 4 ret void } attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } - -!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index e7d62fdc00cff..ae5dca4aa86fb 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -13,8 +13,8 @@ ; float ; -------------------------------------------------------------------- -define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: +define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -27,7 +27,7 @@ define float 
@global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -52,7 +52,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -62,7 +62,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -72,7 +72,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -96,7 +96,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f32: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -120,7 +120,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] @@ -144,7 +144,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -157,7 +157,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -170,12 +170,12 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst ret float %result } -define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +define float 
@global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -188,7 +188,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -213,7 +213,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -223,7 +223,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -233,7 +233,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -257,7 +257,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -281,7 +281,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -306,7 +306,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -319,7 +319,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -333,12 +333,12 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: s_waitcnt 
expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst ret float %result } -define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -351,7 +351,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -376,7 +376,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -386,7 +386,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: 
global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -396,7 +396,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -420,7 +420,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -444,7 +444,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -469,7 +469,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: ; GFX7: ; 
%bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -482,7 +482,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -496,12 +496,12 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst ret float %result } -define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -514,7 +514,7 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f32: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -538,7 +538,7 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX940-NEXT: s_or_b64 exec, exec, 
s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -548,7 +548,7 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -558,7 +558,7 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -581,7 +581,7 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f32: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -604,7 +604,7 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] 
@@ -627,7 +627,7 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -639,7 +639,7 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -651,12 +651,12 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst ret void } -define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -669,7 +669,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: 
+; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -693,7 +693,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -703,7 +703,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -713,7 +713,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -736,7 +736,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -759,7 +759,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -784,7 +784,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -796,7 +796,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -809,12 +809,12 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst ret void } -define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: 
global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -827,7 +827,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -851,7 +851,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -861,7 +861,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -871,7 +871,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; 
GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -894,7 +894,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -917,7 +917,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 @@ -942,7 +942,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -954,7 +954,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -967,12 +967,12 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst ret void } -define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1003,7 +1003,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1028,7 +1028,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -1056,7 +1056,7 @@ define float 
@global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1082,7 +1082,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1108,7 +1108,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1132,7 +1132,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -1157,7 +1157,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1187,7 +1187,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -1219,12 +1219,12 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst ret float %result } -define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1254,7 +1254,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: ; GFX940: 
; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1278,7 +1278,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -1305,7 +1305,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1330,7 +1330,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1355,7 +1355,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1378,7 +1378,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -1403,7 +1403,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1432,7 +1432,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -1463,12 +1463,16 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst ret void } -define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: 
global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: +; -------------------------------------------------------------------- +; float with ftz/daz +; -------------------------------------------------------------------- + +define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__ftz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1481,7 +1485,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__ftz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -1506,7 +1510,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__ftz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1516,7 +1520,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__ftz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1526,7 +1530,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: +; GFX90A-LABEL: 
global_agent_atomic_fmax_ret_f32__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -1550,7 +1554,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__ftz: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -1574,7 +1578,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__ftz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] @@ -1598,7 +1602,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__ftz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1611,7 +1615,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__ftz: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -1624,12 +1628,12 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: 
s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst ret float %result } -define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1637,15 +1641,15 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start @@ -1655,7 +1659,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], 
v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1667,30 +1671,30 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off glc +; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off glc +; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start @@ -1699,7 +1703,7 @@ define float 
@global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1711,10 +1715,10 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start @@ -1723,7 +1727,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 ; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 @@ -1735,66 +1739,64 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: ; GFX6: ; 
%bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst ret float %result } -; -------------------------------------------------------------------- -; float with ftz/daz -; -------------------------------------------------------------------- - -define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: +define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1802,15 +1804,15 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start @@ -1820,7 +1822,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1832,30 +1834,30 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off glc +; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off glc +; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, 
off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start @@ -1864,7 +1866,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1876,10 +1878,10 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start @@ -1888,7 +1890,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 ; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], 
v[3:4], off glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 @@ -1900,62 +1902,64 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 + %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst ret float %result } -define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmax_noret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__ftz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1963,162 +1967,154 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; 
GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__ftz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__ftz: 
; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__ftz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off offset:2044 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__ftz: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__ftz: ; GFX8: ; %bb.0: ; 
GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__ftz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__ftz: ; GFX6: ; %bb.0: ; GFX6-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret float %result + %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst + ret void } -define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2126,162 +2122,157 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: ; GFX940: ; %bb.0: ; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off offset:-2048 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:2044 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; 
GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off offset:-2048 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off offset:2044 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: 
global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; 
GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: 
s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret float %result + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + ret void } -define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2289,15 +2280,15 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: 
global_load_dword v3, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2306,7 +2297,7 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2318,30 +2309,30 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off +; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:-2048 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off +; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off offset:-2048 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: 
global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2349,7 +2340,7 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2361,10 +2352,10 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2372,7 +2363,7 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: 
buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2384,9 +2375,11 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 @@ -2407,210 +2400,322 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 ; 
GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 + %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst ret void } -define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 +; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: 
global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:2044 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off offset:2044 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; 
GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: 
global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; GFX7-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; GFX6-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB14_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void + %result = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst + ret float %result } -define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: 
global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 
s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start @@ -2618,10 +2723,10 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 @@ -2631,30 +2736,62 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:-2048 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; 
GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off offset:-2048 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: 
v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start @@ -2662,8 +2799,10 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -2674,10 +2813,10 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start @@ -2685,7 +2824,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2697,11 +2836,11 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 
s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 @@ -2722,890 +2861,86 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void -} - -define float 
@global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: 
v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: 
v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX8-NEXT: s_cbranch_execnz .LBB16_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GFX7-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB16_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GFX6-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB16_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 - ret float %result -} - -define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; 
GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: 
v_max_f32_e32 v2, v2, v4 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: 
v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: 
v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, 
s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX6-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB17_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void -} - -; -------------------------------------------------------------------- -; double -; -------------------------------------------------------------------- - -define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: 
s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: 
v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: 
s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret double %result -} - -define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] -; 
GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: 
s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7-NEXT: ; 
=>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX7-NEXT: v_mov_b32_e32 v6, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v5 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX6-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v6, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v5 +; 
GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB15_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 - %result = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret double %result + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst + ret void } -define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; -------------------------------------------------------------------- +; double +; -------------------------------------------------------------------- + +define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3613,35 +2948,35 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: 
global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3649,7 +2984,7 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], 
v[6:7] ; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -3657,89 +2992,89 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: s_cbranch_execnz .LBB16_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: 
global_agent_atomic_fmax_ret_f64: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] ; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 +; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; 
GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -3747,13 +3082,13 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; 
GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -3761,355 +3096,373 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 - %result = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst ret double %result } -define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: +define double @global_agent_atomic_fmax_ret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 
-; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: s_cbranch_execnz .LBB17_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 +; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: 
s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 + %result = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst + ret double %result } -define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +define double @global_agent_atomic_fmax_ret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; 
GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:2040 +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; 
GFX11-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off offset:2040 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:2040 +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; 
GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 +; GFX908-NEXT: s_cbranch_execnz .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 
v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 +; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 
addr64 offset:2040 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 - %unused = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 + %result = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst + ret double %result } -define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], 
v[0:1], v[2:5], off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -4117,34 +3470,34 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB23_1 +; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048 +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], 
v[0:1], v[2:5], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -4153,41 +3506,41 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-NEXT: s_cbranch_execnz .LBB19_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off offset:-2048 +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048 +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f64: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB23_1: 
; %atomicrmw.start +; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] @@ -4195,20 +3548,18 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 +; GFX908-NEXT: s_cbranch_execnz .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -4221,388 +3572,373 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB23_1 +; GFX8-NEXT: s_cbranch_execnz .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, 
s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 - %unused = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst ret void } -define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: +define void @global_agent_atomic_fmax_noret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 
; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: ; 
GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:2040 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; 
GFX11-NEXT: s_cbranch_execnz .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off offset:2040 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:2040 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 
v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB24_1 +; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt 
vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB24_1 +; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, 
!amdgpu.no.remote.memory !0 - ret double %result + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 + %unused = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst + ret void } -define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +define void @global_agent_atomic_fmax_noret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: 
v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: 
v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off offset:-2048 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048 ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB25_1 +; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: ; GFX8: ; %bb.0: ; 
GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB25_1 +; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 - ret double %result + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 + %unused = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst + ret void } ; -------------------------------------------------------------------- ; half ; -------------------------------------------------------------------- -define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: +define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -4620,7 +3956,7 @@ define half 
@global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -4641,13 +3977,13 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -4660,7 +3996,7 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -4676,13 +4012,13 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB26_1 +; GFX940-NEXT: s_cbranch_execnz .LBB22_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 
v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 @@ -4696,7 +4032,7 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -4718,13 +4054,13 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-NEXT: s_cbranch_execnz .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -4736,7 +4072,7 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -4753,13 +4089,13 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; 
GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB26_1 +; GFX10-NEXT: s_cbranch_execnz .LBB22_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -4772,7 +4108,7 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -4787,13 +4123,13 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -4806,7 +4142,7 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; 
GFX908-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -4821,13 +4157,13 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB26_1 +; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -4840,7 +4176,7 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -4856,13 +4192,13 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB26_1 +; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4879,7 +4215,7 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v7, v2 -; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -4898,14 +4234,14 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB26_1 +; GFX7-NEXT: s_cbranch_execnz .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -4922,7 +4258,7 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v7, v2 -; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -4941,19 +4277,19 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB26_1 +; GFX6-NEXT: s_cbranch_execnz .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: 
s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst ret half %result } -define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -4972,7 +4308,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -4993,13 +4329,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: ; GFX940: ; %bb.0: ; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -5014,7 +4350,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -5030,13 +4366,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB27_1 +; GFX940-NEXT: s_cbranch_execnz .LBB23_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -5051,7 +4387,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -5073,13 +4409,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz 
.LBB27_1 +; GFX11-NEXT: s_cbranch_execnz .LBB23_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -5092,7 +4428,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -5109,13 +4445,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: s_cbranch_execnz .LBB23_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -5129,7 +4465,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB23_1: ; 
%atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -5144,13 +4480,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -5164,7 +4500,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -5179,13 +4515,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB27_1 +; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: ; GFX8: ; 
%bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -5199,7 +4535,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -5215,13 +4551,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB27_1 +; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -5239,7 +4575,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -5258,14 +4594,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB27_1 +; GFX7-NEXT: s_cbranch_execnz .LBB23_1 ; 
GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -5283,7 +4619,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -5303,7 +4639,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB27_1 +; GFX6-NEXT: s_cbranch_execnz .LBB23_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -5311,12 +4647,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst ret half %result } -define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: 
global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5335,7 +4671,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -5356,13 +4692,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-NEXT: s_cbranch_execnz .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -5378,7 +4714,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -5394,13 +4730,13 @@ define half 
@global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_1 +; GFX940-NEXT: s_cbranch_execnz .LBB24_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -5415,7 +4751,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -5437,13 +4773,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-NEXT: s_cbranch_execnz .LBB24_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -5456,7 +4792,7 @@ define half 
@global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -5473,13 +4809,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB28_1 +; GFX10-NEXT: s_cbranch_execnz .LBB24_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -5493,7 +4829,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -5508,13 +4844,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, 
s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -5528,7 +4864,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -5543,13 +4879,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB28_1 +; GFX908-NEXT: s_cbranch_execnz .LBB24_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -5563,7 +4899,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -5579,13 
+4915,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB28_1 +; GFX8-NEXT: s_cbranch_execnz .LBB24_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -5603,7 +4939,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -5622,14 +4958,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB28_1 +; GFX7-NEXT: s_cbranch_execnz .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -5647,7 +4983,7 @@ define half 
@global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -5667,7 +5003,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB28_1 +; GFX6-NEXT: s_cbranch_execnz .LBB24_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -5675,12 +5011,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 - %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst ret half %result } -define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5698,7 +5034,7 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-NEXT: 
.LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5719,12 +5055,12 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -5737,7 +5073,7 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -5753,12 +5089,12 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB29_1 +; GFX940-NEXT: s_cbranch_execnz .LBB25_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 @@ -5772,7 +5108,7 @@ define void 
@global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5794,12 +5130,12 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-NEXT: s_cbranch_execnz .LBB25_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -5811,7 +5147,7 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5828,12 +5164,12 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB29_1 +; GFX10-NEXT: s_cbranch_execnz .LBB25_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: 
global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -5846,7 +5182,7 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -5861,12 +5197,12 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -5879,7 +5215,7 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5894,12 +5230,12 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, 
exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB29_1 +; GFX908-NEXT: s_cbranch_execnz .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -5912,7 +5248,7 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5928,12 +5264,12 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB29_1 +; GFX8-NEXT: s_cbranch_execnz .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5950,7 +5286,7 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -5969,12 +5305,12 @@ define void 
@global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB29_1 +; GFX7-NEXT: s_cbranch_execnz .LBB25_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -5991,7 +5327,7 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -6011,17 +5347,17 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB29_1 +; GFX6-NEXT: s_cbranch_execnz .LBB25_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst ret void } -define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +define void 
@global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6040,7 +5376,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6061,12 +5397,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -6081,7 +5417,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6097,12 +5433,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; 
GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB30_1 +; GFX940-NEXT: s_cbranch_execnz .LBB26_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -6117,7 +5453,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6139,12 +5475,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-NEXT: s_cbranch_execnz .LBB26_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -6157,7 +5493,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB30_1: 
; %atomicrmw.start +; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6174,12 +5510,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB30_1 +; GFX10-NEXT: s_cbranch_execnz .LBB26_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -6193,7 +5529,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6208,12 +5544,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -6227,7 +5563,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6242,12 +5578,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB30_1 +; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -6261,7 +5597,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6277,12 +5613,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB30_1 +; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: 
s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -6300,7 +5636,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6319,12 +5655,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB30_1 +; GFX7-NEXT: s_cbranch_execnz .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -6342,7 +5678,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6362,18 +5698,18 @@ define void 
@global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB30_1 +; GFX6-NEXT: s_cbranch_execnz .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst ret void } -define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6392,7 +5728,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6413,12 +5749,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -6434,7 +5770,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6450,12 +5786,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB31_1 +; GFX940-NEXT: s_cbranch_execnz .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -6470,7 +5806,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6492,12 +5828,12 @@ define void 
@global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -6510,7 +5846,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6527,12 +5863,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB31_1 +; GFX10-NEXT: s_cbranch_execnz .LBB27_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -6546,7 +5882,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: v_not_b32_e32 
v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6561,12 +5897,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -6580,7 +5916,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6595,12 +5931,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB31_1 +; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: 
global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 @@ -6614,7 +5950,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6630,12 +5966,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB31_1 +; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -6653,7 +5989,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6672,12 +6008,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB31_1 +; GFX7-NEXT: s_cbranch_execnz .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -6695,7 +6031,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6715,18 +6051,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB31_1 +; GFX6-NEXT: s_cbranch_execnz .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 - %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst ret void } -define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +define half 
@global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6736,7 +6072,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -6754,20 +6090,20 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -6781,19 +6117,19 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], 
vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB32_1 +; GFX940-NEXT: s_cbranch_execnz .LBB28_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -6812,19 +6148,19 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -6840,20 +6176,20 @@ define half 
@global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB32_1 +; GFX10-NEXT: s_cbranch_execnz .LBB28_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -6866,20 +6202,20 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; 
GFX908-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -6892,13 +6228,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB32_1 +; GFX908-NEXT: s_cbranch_execnz .LBB28_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -6906,7 +6242,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -6920,12 +6256,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB32_1 +; GFX8-NEXT: s_cbranch_execnz .LBB28_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: 
; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -6936,7 +6272,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6953,13 +6289,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB32_1 +; GFX7-NEXT: s_cbranch_execnz .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -6970,7 +6306,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6988,19 +6324,19 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB32_1 +; GFX6-NEXT: s_cbranch_execnz .LBB28_1 ; GFX6-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4 ret half %result } -define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -7010,7 +6346,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 @@ -7028,19 +6364,19 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; 
GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_max_f16_e32 v2, v3, v3 @@ -7054,18 +6390,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB33_1 +; GFX940-NEXT: s_cbranch_execnz .LBB29_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f16_e32 v2, v3, v3 @@ -7084,18 +6420,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-NEXT: s_cbranch_execnz .LBB29_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 
s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 @@ -7111,19 +6447,19 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB33_1 +; GFX10-NEXT: s_cbranch_execnz .LBB29_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 @@ -7136,19 +6472,19 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 ; 
GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 @@ -7161,12 +6497,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB33_1 +; GFX908-NEXT: s_cbranch_execnz .LBB29_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 @@ -7174,7 +6510,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 -; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 @@ -7188,12 +6524,12 @@ define void 
@global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB33_1 +; GFX8-NEXT: s_cbranch_execnz .LBB29_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -7204,7 +6540,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -7221,12 +6557,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB33_1 +; GFX7-NEXT: s_cbranch_execnz .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -7237,7 +6573,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 -; 
GFX6-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -7255,18 +6591,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB33_1 +; GFX6-NEXT: s_cbranch_execnz .LBB29_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4 ret void } -define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -7285,7 +6621,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -7306,13 +6642,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: 
s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -7327,7 +6663,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -7343,13 +6679,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB34_1 +; GFX940-NEXT: s_cbranch_execnz .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -7364,7 +6700,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -7386,13 +6722,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-NEXT: s_cbranch_execnz .LBB30_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -7405,7 +6741,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -7422,13 +6758,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB34_1 +; GFX10-NEXT: s_cbranch_execnz .LBB30_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: 
global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -7442,7 +6778,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -7459,13 +6795,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -7479,7 +6815,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -7494,13 +6830,13 @@ define half 
@global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB34_1 +; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -7514,7 +6850,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -7530,13 +6866,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB34_1 +; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -7554,7 +6890,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; 
GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -7573,14 +6909,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB34_1 +; GFX7-NEXT: s_cbranch_execnz .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -7598,7 +6934,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -7618,7 +6954,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB34_1 +; GFX6-NEXT: s_cbranch_execnz .LBB30_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -7626,12 +6962,12 @@ define half 
@global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmax ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(1) %gep, half %val seq_cst ret half %result } -define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -7650,7 +6986,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7671,12 +7007,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -7691,7 +7027,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7707,12 +7043,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB35_1 +; GFX940-NEXT: s_cbranch_execnz .LBB31_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -7727,7 +7063,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7749,12 +7085,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-NEXT: s_cbranch_execnz .LBB31_1 ; GFX11-NEXT: 
; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -7767,7 +7103,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7784,12 +7120,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB35_1 +; GFX10-NEXT: s_cbranch_execnz .LBB31_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -7803,7 +7139,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ 
-7820,12 +7156,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -7839,7 +7175,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7854,12 +7190,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB35_1 +; GFX908-NEXT: s_cbranch_execnz .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -7873,7 +7209,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: 
v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7889,12 +7225,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB35_1 +; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -7912,7 +7248,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7931,12 +7267,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB35_1 +; GFX7-NEXT: s_cbranch_execnz .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: 
global_system_atomic_fmax_noret_f16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -7954,7 +7290,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7974,13 +7310,13 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB35_1 +; GFX6-NEXT: s_cbranch_execnz .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val seq_cst ret void } @@ -7988,8 +7324,8 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; bfloat ; -------------------------------------------------------------------- -define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: +define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -8006,7 +7342,7 @@ define bfloat 
@global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -8034,13 +7370,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-NEXT: s_cbranch_execnz .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -8054,7 +7390,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -8076,13 +7412,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB36_1 +; GFX940-NEXT: s_cbranch_execnz .LBB32_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: 
v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 @@ -8096,7 +7432,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -8125,13 +7461,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-NEXT: s_cbranch_execnz .LBB32_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -8143,7 +7479,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -8164,13 +7500,13 @@ define bfloat 
@global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB36_1 +; GFX10-NEXT: s_cbranch_execnz .LBB32_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -8184,7 +7520,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -8203,13 +7539,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -8223,7 +7559,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; 
GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -8242,13 +7578,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB36_1 +; GFX908-NEXT: s_cbranch_execnz .LBB32_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -8261,7 +7597,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -8282,13 +7618,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB36_1 +; GFX8-NEXT: s_cbranch_execnz .LBB32_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: 
global_agent_atomic_fmax_ret_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -8305,7 +7641,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -8325,14 +7661,14 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB36_1 +; GFX7-NEXT: s_cbranch_execnz .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -8349,7 +7685,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -8369,19 +7705,19 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz 
.LBB36_1 +; GFX6-NEXT: s_cbranch_execnz .LBB32_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst ret bfloat %result } -define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -8400,7 +7736,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -8428,13 +7764,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -8450,7 +7786,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -8472,13 +7808,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB37_1 +; GFX940-NEXT: s_cbranch_execnz .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -8494,7 +7830,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -8523,13 +7859,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: 
s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-NEXT: s_cbranch_execnz .LBB33_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -8542,7 +7878,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -8563,13 +7899,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB37_1 +; GFX10-NEXT: s_cbranch_execnz .LBB33_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -8584,7 +7920,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; 
GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -8603,13 +7939,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -8624,7 +7960,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -8643,13 +7979,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB37_1 +; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -8663,7 +7999,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -8684,13 +8020,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB37_1 +; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -8708,7 +8044,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8728,14 +8064,14 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_or_b64 s[8:9], vcc, 
s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB37_1 +; GFX7-NEXT: s_cbranch_execnz .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -8753,7 +8089,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8774,7 +8110,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB37_1 +; GFX6-NEXT: s_cbranch_execnz .LBB33_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -8782,12 +8118,12 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst ret bfloat %result } -define bfloat 
@global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -8806,7 +8142,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -8834,13 +8170,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -8857,7 +8193,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start ; 
GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -8879,13 +8215,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB38_1 +; GFX940-NEXT: s_cbranch_execnz .LBB34_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -8901,7 +8237,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -8930,13 +8266,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-NEXT: s_cbranch_execnz .LBB34_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -8949,7 +8285,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -8970,13 +8306,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB38_1 +; GFX10-NEXT: s_cbranch_execnz .LBB34_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -8991,7 +8327,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -9010,13 +8346,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: 
s_cbranch_execnz .LBB38_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -9031,7 +8367,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -9050,13 +8386,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB38_1 +; GFX908-NEXT: s_cbranch_execnz .LBB34_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -9070,7 +8406,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX8-NEXT: 
.LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -9091,13 +8427,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB38_1 +; GFX8-NEXT: s_cbranch_execnz .LBB34_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -9115,7 +8451,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -9135,14 +8471,14 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB38_1 +; GFX7-NEXT: s_cbranch_execnz .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: ; GFX6: ; 
%bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -9160,7 +8496,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -9181,7 +8517,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB38_1 +; GFX6-NEXT: s_cbranch_execnz .LBB34_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -9189,12 +8525,12 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 - %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst ret bfloat %result } -define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -9211,7 +8547,7 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: 
v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9238,12 +8574,12 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -9257,7 +8593,7 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9279,12 +8615,12 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB39_1 +; GFX940-NEXT: s_cbranch_execnz .LBB35_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 @@ -9298,7 +8634,7 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9326,12 +8662,12 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-NEXT: s_cbranch_execnz .LBB35_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -9343,7 +8679,7 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9364,12 +8700,12 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; 
GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB39_1 +; GFX10-NEXT: s_cbranch_execnz .LBB35_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -9383,7 +8719,7 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9402,12 +8738,12 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -9421,7 +8757,7 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: 
.LBB39_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9440,12 +8776,12 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB39_1 +; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -9458,7 +8794,7 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9479,12 +8815,12 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB39_1 +; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16: ; GFX7: 
; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -9501,7 +8837,7 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9521,12 +8857,12 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB39_1 +; GFX7-NEXT: s_cbranch_execnz .LBB35_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -9543,7 +8879,7 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9564,17 +8900,17 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB39_1 +; GFX6-NEXT: s_cbranch_execnz .LBB35_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; 
GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst ret void } -define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -9593,7 +8929,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9620,12 +8956,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -9641,7 +8977,7 @@ define void 
@global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9663,12 +8999,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB40_1 +; GFX940-NEXT: s_cbranch_execnz .LBB36_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -9684,7 +9020,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9712,12 +9048,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-NEXT: s_cbranch_execnz .LBB36_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -9730,7 +9066,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9751,12 +9087,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB40_1 +; GFX10-NEXT: s_cbranch_execnz .LBB36_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -9771,7 +9107,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9790,12 +9126,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -9810,7 +9146,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9829,12 +9165,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB40_1 +; GFX908-NEXT: s_cbranch_execnz .LBB36_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -9848,7 +9184,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9869,12 +9205,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB40_1 +; GFX8-NEXT: s_cbranch_execnz .LBB36_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -9892,7 +9228,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9912,12 +9248,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB40_1 +; GFX7-NEXT: s_cbranch_execnz .LBB36_1 ; GFX7-NEXT: ; %bb.2: 
; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -9935,7 +9271,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9956,18 +9292,18 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB40_1 +; GFX6-NEXT: s_cbranch_execnz .LBB36_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst ret void } -define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 
0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -9986,7 +9322,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10013,12 +9349,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -10035,7 +9371,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10057,12 +9393,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB41_1 +; 
GFX940-NEXT: s_cbranch_execnz .LBB37_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -10078,7 +9414,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10106,12 +9442,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB41_1 +; GFX11-NEXT: s_cbranch_execnz .LBB37_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -10124,7 +9460,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: 
s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10145,12 +9481,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB41_1 +; GFX10-NEXT: s_cbranch_execnz .LBB37_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -10165,7 +9501,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10184,12 +9520,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: ; GFX908: ; %bb.0: ; 
GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -10204,7 +9540,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10223,12 +9559,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB41_1 +; GFX908-NEXT: s_cbranch_execnz .LBB37_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 @@ -10242,7 +9578,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10263,12 +9599,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: 
v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB41_1 +; GFX8-NEXT: s_cbranch_execnz .LBB37_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -10286,7 +9622,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10306,12 +9642,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB41_1 +; GFX7-NEXT: s_cbranch_execnz .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -10329,7 +9665,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX6-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10350,18 +9686,18 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB41_1 +; GFX6-NEXT: s_cbranch_execnz .LBB37_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 - %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst ret void } -define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -10371,7 +9707,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -10396,13 +9732,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB42_1 +; GFX12-NEXT: s_cbranch_execnz .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10410,7 +9746,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff ; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -10431,20 +9767,20 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB42_1 +; GFX940-NEXT: s_cbranch_execnz .LBB38_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start +; 
GFX11-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -10470,19 +9806,19 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB42_1 +; GFX11-NEXT: s_cbranch_execnz .LBB38_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -10503,13 +9839,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB42_1 +; GFX10-NEXT: s_cbranch_execnz .LBB38_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, 
v[0:1], off offset:2046 @@ -10517,7 +9853,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -10536,13 +9872,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10550,7 +9886,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -10569,13 +9905,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB42_1 +; GFX908-NEXT: 
s_cbranch_execnz .LBB38_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -10583,7 +9919,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -10603,12 +9939,12 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB42_1 +; GFX8-NEXT: s_cbranch_execnz .LBB38_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -10619,7 +9955,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; 
GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10637,13 +9973,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB42_1 +; GFX7-NEXT: s_cbranch_execnz .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -10654,7 +9990,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10673,19 +10009,19 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB42_1 +; GFX6-NEXT: s_cbranch_execnz .LBB38_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4 
ret bfloat %result } -define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -10695,7 +10031,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10719,12 +10055,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10732,7 +10068,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff ; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; 
GFX940-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10753,19 +10089,19 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB43_1 +; GFX940-NEXT: s_cbranch_execnz .LBB39_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10790,18 +10126,18 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB43_1 +; GFX11-NEXT: s_cbranch_execnz .LBB39_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: 
global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10822,12 +10158,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB43_1 +; GFX10-NEXT: s_cbranch_execnz .LBB39_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10835,7 +10171,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10854,12 +10190,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: 
global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10867,7 +10203,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10886,12 +10222,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB43_1 +; GFX908-NEXT: s_cbranch_execnz .LBB39_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 @@ -10899,7 +10235,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10919,12 +10255,12 @@ define void 
@global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB43_1 +; GFX8-NEXT: s_cbranch_execnz .LBB39_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -10935,7 +10271,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10953,12 +10289,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB43_1 +; GFX7-NEXT: s_cbranch_execnz .LBB39_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -10969,7 +10305,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: 
v_and_b32_e32 v4, 0xffff0000, v2 -; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10988,18 +10324,18 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB43_1 +; GFX6-NEXT: s_cbranch_execnz .LBB39_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4 ret void } -define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -11018,7 +10354,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -11046,13 +10382,13 @@ define bfloat 
@global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -11068,7 +10404,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -11090,13 +10426,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB44_1 +; GFX940-NEXT: s_cbranch_execnz .LBB40_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -11112,7 +10448,7 @@ define bfloat 
@global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -11141,13 +10477,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-NEXT: s_cbranch_execnz .LBB40_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -11160,7 +10496,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -11181,13 +10517,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB44_1 +; GFX10-NEXT: s_cbranch_execnz .LBB40_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; 
GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -11202,7 +10538,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -11223,13 +10559,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -11244,7 +10580,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: 
v_mov_b32_e32 v6, v5 @@ -11263,13 +10599,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB44_1 +; GFX908-NEXT: s_cbranch_execnz .LBB40_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -11283,7 +10619,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -11304,13 +10640,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB44_1 +; GFX8-NEXT: s_cbranch_execnz .LBB40_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -11328,7 +10664,7 @@ define bfloat 
@global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -11348,14 +10684,14 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB44_1 +; GFX7-NEXT: s_cbranch_execnz .LBB40_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -11373,7 +10709,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -11394,7 +10730,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB44_1 +; GFX6-NEXT: s_cbranch_execnz .LBB40_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: 
v_lshrrev_b32_e32 v0, v6, v4 @@ -11402,12 +10738,12 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val seq_cst ret bfloat %result } -define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -11426,7 +10762,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11453,12 +10789,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: 
global_system_atomic_fmax_noret_bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -11474,7 +10810,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11496,12 +10832,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB45_1 +; GFX940-NEXT: s_cbranch_execnz .LBB41_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -11517,7 +10853,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11545,12 +10881,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-NEXT: s_cbranch_execnz .LBB41_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -11563,7 +10899,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11584,12 +10920,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB45_1 +; GFX10-NEXT: s_cbranch_execnz .LBB41_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -11604,7 +10940,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; 
GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11625,12 +10961,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -11645,7 +10981,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11664,12 +11000,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB45_1 +; GFX908-NEXT: s_cbranch_execnz .LBB41_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 
s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -11683,7 +11019,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11704,12 +11040,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB45_1 +; GFX8-NEXT: s_cbranch_execnz .LBB41_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -11727,7 +11063,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11747,12 +11083,12 @@ define void 
@global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB45_1 +; GFX7-NEXT: s_cbranch_execnz .LBB41_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -11770,7 +11106,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11791,13 +11127,13 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB45_1 +; GFX6-NEXT: s_cbranch_execnz .LBB41_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val seq_cst ret void } @@ -11805,8 +11141,8 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; <2 x half> ; -------------------------------------------------------------------- -define <2 x 
half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: +define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -11816,7 +11152,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -11831,19 +11167,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB46_1 +; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -11857,19 +11193,19 @@ define <2 x half> 
@global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB46_1 +; GFX940-NEXT: s_cbranch_execnz .LBB42_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -11885,19 +11221,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB46_1 +; GFX11-NEXT: s_cbranch_execnz .LBB42_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -11911,19 +11247,19 
@@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB46_1 +; GFX10-NEXT: s_cbranch_execnz .LBB42_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -11935,19 +11271,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; 
GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -11959,20 +11295,20 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB46_1 +; GFX908-NEXT: s_cbranch_execnz .LBB42_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -11987,13 +11323,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB46_1 +; GFX8-NEXT: s_cbranch_execnz .LBB42_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -12010,7 +11346,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: 
v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -12035,14 +11371,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB46_1 +; GFX7-NEXT: s_cbranch_execnz .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -12059,7 +11395,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -12085,19 +11421,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB46_1 +; GFX6-NEXT: s_cbranch_execnz .LBB42_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr 
addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst ret <2 x half> %result } -define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12107,7 +11443,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -12122,19 +11458,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB47_1 +; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; 
GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -12148,19 +11484,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB47_1 +; GFX940-NEXT: s_cbranch_execnz .LBB43_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -12176,19 +11512,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB47_1 +; GFX11-NEXT: s_cbranch_execnz .LBB43_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -12202,19 +11538,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB47_1 +; GFX10-NEXT: s_cbranch_execnz .LBB43_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -12226,19 +11562,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 
+; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -12250,13 +11586,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB47_1 +; GFX908-NEXT: s_cbranch_execnz .LBB43_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -12265,7 +11601,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -12280,12 +11616,12 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz 
.LBB47_1 +; GFX8-NEXT: s_cbranch_execnz .LBB43_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -12302,7 +11638,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -12327,14 +11663,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB47_1 +; GFX7-NEXT: s_cbranch_execnz .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -12351,7 +11687,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: 
v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -12377,7 +11713,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB47_1 +; GFX6-NEXT: s_cbranch_execnz .LBB43_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 @@ -12385,12 +11721,12 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst ret <2 x half> %result } -define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12400,7 +11736,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -12415,19 +11751,19 @@ define <2 x half> 
@global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB48_1 +; GFX12-NEXT: s_cbranch_execnz .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -12441,19 +11777,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB48_1 +; GFX940-NEXT: s_cbranch_execnz .LBB44_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -12469,19 +11805,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB48_1 +; GFX11-NEXT: s_cbranch_execnz .LBB44_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -12495,19 +11831,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB48_1 +; GFX10-NEXT: s_cbranch_execnz .LBB44_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; 
GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -12519,19 +11855,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -12543,13 +11879,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB48_1 +; GFX908-NEXT: s_cbranch_execnz .LBB44_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -12558,7 +11894,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -12573,12 +11909,12 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB48_1 +; GFX8-NEXT: s_cbranch_execnz .LBB44_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -12599,7 +11935,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -12624,12 +11960,12 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB48_1 +; GFX7-NEXT: s_cbranch_execnz .LBB44_1 ; GFX7-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -12650,7 +11986,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -12676,18 +12012,18 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB48_1 +; GFX6-NEXT: s_cbranch_execnz .LBB44_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst ret <2 x half> %result } -define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; 
GFX12-NEXT: s_wait_expcnt 0x0 @@ -12697,7 +12033,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -12712,18 +12048,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB49_1 +; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12737,18 +12073,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB49_1 +; GFX940-NEXT: s_cbranch_execnz .LBB45_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: +; 
GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12764,18 +12100,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB49_1 +; GFX11-NEXT: s_cbranch_execnz .LBB45_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12789,18 +12125,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB49_1 +; GFX10-NEXT: s_cbranch_execnz .LBB45_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16: 
; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12812,18 +12148,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12835,19 +12171,19 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB49_1 +; GFX908-NEXT: s_cbranch_execnz .LBB45_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16: ; GFX8: ; %bb.0: ; 
GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -12862,12 +12198,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB49_1 +; GFX8-NEXT: s_cbranch_execnz .LBB45_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -12884,7 +12220,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12909,12 +12245,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB49_1 +; GFX7-NEXT: s_cbranch_execnz .LBB45_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: 
s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -12931,7 +12267,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12957,17 +12293,17 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB49_1 +; GFX6-NEXT: s_cbranch_execnz .LBB45_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst ret void } -define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12977,7 +12313,7 @@ define void 
@global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -12992,18 +12328,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB50_1 +; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13017,18 +12353,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB50_1 +; GFX940-NEXT: s_cbranch_execnz .LBB46_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; 
GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13044,18 +12380,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-NEXT: s_cbranch_execnz .LBB46_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13069,18 +12405,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB50_1 +; GFX10-NEXT: s_cbranch_execnz .LBB46_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: 
global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13092,18 +12428,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13115,12 +12451,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB50_1 +; GFX908-NEXT: s_cbranch_execnz .LBB46_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, 
exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -13129,7 +12465,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -13144,12 +12480,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB50_1 +; GFX8-NEXT: s_cbranch_execnz .LBB46_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -13166,7 +12502,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: 
v_cvt_f16_f32_e32 v4, v4 @@ -13191,12 +12527,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB50_1 +; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -13213,7 +12549,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -13239,18 +12575,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB50_1 +; GFX6-NEXT: s_cbranch_execnz .LBB46_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst ret void } -define void 
@global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -13260,7 +12596,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -13275,18 +12611,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB51_1 +; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ 
-13300,18 +12636,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB51_1 +; GFX940-NEXT: s_cbranch_execnz .LBB47_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13327,18 +12663,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-NEXT: s_cbranch_execnz .LBB47_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) 
; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13352,18 +12688,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB51_1 +; GFX10-NEXT: s_cbranch_execnz .LBB47_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13375,18 +12711,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13398,12 +12734,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB51_1 +; GFX908-NEXT: s_cbranch_execnz .LBB47_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 @@ -13412,7 +12748,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -13427,12 +12763,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB51_1 +; GFX8-NEXT: s_cbranch_execnz .LBB47_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: ; GFX7: ; %bb.0: ; 
GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -13453,7 +12789,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -13478,12 +12814,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB51_1 +; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -13504,7 +12840,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -13530,18 +12866,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB51_1 +; GFX6-NEXT: s_cbranch_execnz .LBB47_1 ; GFX6-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst ret void } -define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -13551,7 +12887,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -13566,19 +12902,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB52_1 +; GFX12-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: 
global_system_atomic_fmax_ret_v2f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -13592,19 +12928,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB52_1 +; GFX940-NEXT: s_cbranch_execnz .LBB48_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -13620,19 +12956,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-NEXT: s_cbranch_execnz .LBB48_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 
s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -13646,19 +12982,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB52_1 +; GFX10-NEXT: s_cbranch_execnz .LBB48_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -13672,19 +13008,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 ; GFX90A-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -13696,13 +13032,13 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB52_1 +; GFX908-NEXT: s_cbranch_execnz .LBB48_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -13711,7 +13047,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -13726,12 +13062,12 @@ 
define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB52_1 +; GFX8-NEXT: s_cbranch_execnz .LBB48_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -13748,7 +13084,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -13773,14 +13109,14 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB52_1 +; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -13797,7 +13133,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 
v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -13823,7 +13159,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB52_1 +; GFX6-NEXT: s_cbranch_execnz .LBB48_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 @@ -13831,12 +13167,12 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst ret <2 x half> %result } -define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -13846,7 +13182,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB49_1: ; 
%atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -13861,18 +13197,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB53_1 +; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13886,18 +13222,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB53_1 +; GFX940-NEXT: s_cbranch_execnz .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; 
GFX11-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13913,18 +13249,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-NEXT: s_cbranch_execnz .LBB49_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13938,18 +13274,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB53_1 +; GFX10-NEXT: s_cbranch_execnz .LBB49_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 
s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13963,18 +13299,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13986,12 +13322,12 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB53_1 +; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: 
v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -14000,7 +13336,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -14015,12 +13351,12 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB53_1 +; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -14037,7 +13373,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -14062,12 +13398,12 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB53_1 +; GFX7-NEXT: 
s_cbranch_execnz .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -14084,7 +13420,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -14110,13 +13446,13 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB53_1 +; GFX6-NEXT: s_cbranch_execnz .LBB49_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst ret void } @@ -14124,8 +13460,8 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; <2 x bfloat> ; -------------------------------------------------------------------- -define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: +define <2 x bfloat> 
@global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -14136,7 +13472,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -14168,13 +13504,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -14183,7 +13519,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -14210,13 +13546,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; 
GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB54_1 +; GFX940-NEXT: s_cbranch_execnz .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off @@ -14225,7 +13561,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -14258,21 +13594,21 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-NEXT: s_cbranch_execnz .LBB50_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; 
GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -14299,13 +13635,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB54_1 +; GFX10-NEXT: s_cbranch_execnz .LBB50_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -14314,7 +13650,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -14339,13 +13675,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -14354,7 +13690,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -14379,20 +13715,20 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB54_1 +; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -14420,13 +13756,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB54_1 +; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] 
; -; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -14442,7 +13778,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -14464,14 +13800,14 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB54_1 +; GFX7-NEXT: s_cbranch_execnz .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -14487,7 +13823,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -14510,19 +13846,19 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX6-NEXT: 
s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB54_1 +; GFX6-NEXT: s_cbranch_execnz .LBB50_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst ret <2 x bfloat> %result } -define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -14533,7 +13869,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -14565,13 +13901,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: 
s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -14580,7 +13916,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -14607,13 +13943,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB55_1 +; GFX940-NEXT: s_cbranch_execnz .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -14622,7 +13958,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -14655,21 +13991,21 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-NEXT: s_cbranch_execnz .LBB51_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -14696,13 +14032,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB55_1 +; GFX10-NEXT: s_cbranch_execnz .LBB51_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -14711,7 +14047,7 
@@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -14736,13 +14072,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -14751,7 +14087,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -14776,13 +14112,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB55_1 +; GFX908-NEXT: s_cbranch_execnz .LBB51_1 ; GFX908-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -14791,7 +14127,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -14819,12 +14155,12 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB55_1 +; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -14840,7 +14176,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -14862,14 
+14198,14 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB55_1 +; GFX7-NEXT: s_cbranch_execnz .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -14885,7 +14221,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -14908,7 +14244,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB55_1 +; GFX6-NEXT: s_cbranch_execnz .LBB51_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 @@ -14916,12 +14252,12 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 
+ %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst ret <2 x bfloat> %result } -define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -14932,7 +14268,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -14964,13 +14300,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-NEXT: s_cbranch_execnz .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -14979,7 +14315,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX940-NEXT: s_movk_i32 
s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -15006,13 +14342,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB56_1 +; GFX940-NEXT: s_cbranch_execnz .LBB52_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 @@ -15021,7 +14357,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -15054,21 +14390,21 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-NEXT: s_cbranch_execnz .LBB52_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; 
GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -15095,13 +14431,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB56_1 +; GFX10-NEXT: s_cbranch_execnz .LBB52_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -15110,7 +14446,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -15135,13 +14471,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; 
GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -15150,7 +14486,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -15175,13 +14511,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB56_1 +; GFX908-NEXT: s_cbranch_execnz .LBB52_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -15190,7 +14526,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; 
GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -15218,12 +14554,12 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB56_1 +; GFX8-NEXT: s_cbranch_execnz .LBB52_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -15243,7 +14579,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -15265,12 +14601,12 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB56_1 +; GFX7-NEXT: s_cbranch_execnz .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; 
GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -15290,7 +14626,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -15313,18 +14649,18 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB56_1 +; GFX6-NEXT: s_cbranch_execnz .LBB52_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst ret <2 x bfloat> %result } -define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -15335,7 +14671,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; 
GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15366,12 +14702,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB57_1 +; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -15380,7 +14716,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15407,12 +14743,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB57_1 +; GFX940-NEXT: s_cbranch_execnz .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: 
global_agent_atomic_fmax_noret_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off @@ -15421,7 +14757,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15453,20 +14789,20 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-NEXT: s_cbranch_execnz .LBB53_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15493,12 +14829,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB57_1 +; GFX10-NEXT: s_cbranch_execnz .LBB53_1 ; GFX10-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -15507,7 +14843,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15532,12 +14868,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -15546,7 +14882,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15571,19 +14907,19 @@ define 
void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB57_1 +; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15611,12 +14947,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB57_1 +; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -15632,7 +14968,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15654,12 +14990,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB57_1 +; GFX7-NEXT: s_cbranch_execnz .LBB53_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -15675,7 +15011,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15698,17 +15034,17 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB57_1 +; GFX6-NEXT: s_cbranch_execnz .LBB53_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst ret void } -define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr 
addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -15719,7 +15055,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15750,12 +15086,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB58_1 +; GFX12-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15764,7 +15100,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: 
s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15791,12 +15127,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB58_1 +; GFX940-NEXT: s_cbranch_execnz .LBB54_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -15805,7 +15141,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15837,20 +15173,20 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB58_1 +; GFX11-NEXT: s_cbranch_execnz .LBB54_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, 
v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15877,12 +15213,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB58_1 +; GFX10-NEXT: s_cbranch_execnz .LBB54_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15891,7 +15227,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15916,12 +15252,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: 
global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15930,7 +15266,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15955,12 +15291,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB58_1 +; GFX908-NEXT: s_cbranch_execnz .LBB54_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -15969,7 +15305,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15997,12 +15333,12 @@ define void 
@global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB58_1 +; GFX8-NEXT: s_cbranch_execnz .LBB54_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -16018,7 +15354,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -16040,12 +15376,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB58_1 +; GFX7-NEXT: s_cbranch_execnz .LBB54_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -16061,7 +15397,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: 
v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -16084,18 +15420,18 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB58_1 +; GFX6-NEXT: s_cbranch_execnz .LBB54_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst ret void } -define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -16106,7 +15442,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16137,12 +15473,12 @@ define 
void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB59_1 +; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -16151,7 +15487,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16178,12 +15514,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB59_1 +; GFX940-NEXT: s_cbranch_execnz .LBB55_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 @@ -16192,7 +15528,7 @@ define void 
@global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16224,20 +15560,20 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB59_1 +; GFX11-NEXT: s_cbranch_execnz .LBB55_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16264,12 +15600,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB59_1 +; GFX10-NEXT: s_cbranch_execnz .LBB55_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: 
global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -16278,7 +15614,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16303,12 +15639,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB59_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -16317,7 +15653,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16342,12 +15678,12 @@ define void 
@global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB59_1 +; GFX908-NEXT: s_cbranch_execnz .LBB55_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 @@ -16356,7 +15692,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16384,12 +15720,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB59_1 +; GFX8-NEXT: s_cbranch_execnz .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -16409,7 +15745,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 
0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -16431,12 +15767,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB59_1 +; GFX7-NEXT: s_cbranch_execnz .LBB55_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -16456,7 +15792,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -16479,18 +15815,18 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB59_1 +; GFX6-NEXT: s_cbranch_execnz .LBB55_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x 
bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst ret void } -define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -16501,7 +15837,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -16533,13 +15869,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB60_1 +; GFX12-NEXT: s_cbranch_execnz .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16548,7 +15884,7 @@ define <2 x bfloat> 
@global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -16575,13 +15911,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB60_1 +; GFX940-NEXT: s_cbranch_execnz .LBB56_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -16590,7 +15926,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -16623,21 +15959,21 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB60_1 +; GFX11-NEXT: s_cbranch_execnz .LBB56_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 
0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -16664,13 +16000,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB60_1 +; GFX10-NEXT: s_cbranch_execnz .LBB56_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16679,7 +16015,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -16706,13 +16042,13 @@ 
define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16721,7 +16057,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -16746,13 +16082,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB60_1 +; GFX908-NEXT: s_cbranch_execnz .LBB56_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -16761,7 +16097,7 @@ define 
<2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -16789,12 +16125,12 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB60_1 +; GFX8-NEXT: s_cbranch_execnz .LBB56_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -16810,7 +16146,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16832,14 +16168,14 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB60_1 +; GFX7-NEXT: s_cbranch_execnz .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: 
v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -16855,7 +16191,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16878,7 +16214,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB60_1 +; GFX6-NEXT: s_cbranch_execnz .LBB56_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 @@ -16886,12 +16222,12 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst ret <2 x bfloat> %result } -define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x 
bfloat> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -16902,7 +16238,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16933,12 +16269,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB61_1 +; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16947,7 +16283,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16974,12 +16310,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: 
v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB61_1 +; GFX940-NEXT: s_cbranch_execnz .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -16988,7 +16324,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17020,20 +16356,20 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB61_1 +; GFX11-NEXT: s_cbranch_execnz .LBB57_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB57_1: ; 
%atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17060,12 +16396,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB61_1 +; GFX10-NEXT: s_cbranch_execnz .LBB57_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -17074,7 +16410,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17101,12 +16437,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -17115,7 +16451,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17140,12 +16476,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB61_1 +; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -17154,7 +16490,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17182,12 +16518,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB61_1 +; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; 
GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -17203,7 +16539,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -17225,12 +16561,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB61_1 +; GFX7-NEXT: s_cbranch_execnz .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -17246,7 +16582,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -17269,17 +16605,15 @@ define void 
@global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB61_1 +; GFX6-NEXT: s_cbranch_execnz .LBB57_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst ret void } attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } - -!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index 91a8ac7c935b6..915ce7433f5b0 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -13,8 +13,8 @@ ; float ; -------------------------------------------------------------------- -define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: +define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -27,7 +27,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32: ; GFX940: ; 
%bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -52,7 +52,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -62,7 +62,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -72,7 +72,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -96,7 +96,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -120,7 +120,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: 
global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] @@ -144,7 +144,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -157,7 +157,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -170,12 +170,12 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst ret float %result } -define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -188,7 +188,7 @@ define float 
@global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -213,7 +213,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -223,7 +223,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -233,7 +233,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -257,7 +257,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: 
global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -281,7 +281,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -306,7 +306,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -319,7 +319,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -333,12 +333,12 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(1) %gep, 
float %val syncscope("agent") seq_cst ret float %result } -define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -351,7 +351,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -376,7 +376,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -386,7 +386,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -396,7 +396,7 @@ define float 
@global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -420,7 +420,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -444,7 +444,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -469,7 +469,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -482,7 +482,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: 
global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -496,12 +496,12 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst ret float %result } -define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -514,7 +514,7 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f32: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -538,7 +538,7 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: 
s_waitcnt_vscnt null, 0x0 @@ -548,7 +548,7 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -558,7 +558,7 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -581,7 +581,7 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f32: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -604,7 +604,7 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] @@ -627,7 +627,7 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: +; 
GFX7-LABEL: global_agent_atomic_fmin_noret_f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -639,7 +639,7 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -651,12 +651,12 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst ret void } -define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -669,7 +669,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -693,7 +693,7 @@ define void 
@global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -703,7 +703,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -713,7 +713,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -736,7 +736,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -759,7 +759,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 
s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -784,7 +784,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -796,7 +796,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -809,12 +809,12 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst ret void } -define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: 
global_agent_atomic_fmin_noret_f32__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -827,7 +827,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -851,7 +851,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -861,7 +861,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -871,7 +871,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ 
-894,7 +894,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -917,7 +917,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 @@ -942,7 +942,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -954,7 +954,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -967,12 +967,12 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] 
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst ret void } -define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1003,7 +1003,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1028,7 +1028,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -1056,7 +1056,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; 
GFX10-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1082,7 +1082,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1108,7 +1108,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1132,7 +1132,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -1157,7 +1157,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1187,7 +1187,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -1219,12 +1219,12 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst ret float %result } -define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1254,7 +1254,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1278,7 +1278,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; 
GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -1305,7 +1305,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1330,7 +1330,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1355,7 +1355,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1378,7 +1378,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -1403,7 +1403,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1432,7 +1432,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -1463,12 +1463,16 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst ret void } -define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: +; -------------------------------------------------------------------- +; float with ftz/daz +; -------------------------------------------------------------------- + +define float 
@global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__ftz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1481,7 +1485,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__ftz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -1506,7 +1510,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__ftz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1516,7 +1520,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__ftz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1526,7 +1530,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -1550,7 +1554,7 @@ define float 
@global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__ftz: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -1574,7 +1578,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__ftz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] @@ -1598,7 +1602,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__ftz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1611,7 +1615,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__ftz: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -1624,12 +1628,12 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val 
syncscope("agent") seq_cst ret float %result } -define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1637,15 +1641,15 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start @@ -1655,7 +1659,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 
@@ -1667,30 +1671,30 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off glc +; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off glc +; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start @@ -1699,7 +1703,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: 
global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1711,10 +1715,10 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start @@ -1723,7 +1727,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 ; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 @@ -1735,66 +1739,64 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 
0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: 
buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst ret float %result } -; -------------------------------------------------------------------- -; float with ftz/daz -; -------------------------------------------------------------------- - -define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: +define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1802,15 +1804,15 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: 
global_load_dword v3, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start @@ -1820,7 +1822,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1832,30 +1834,30 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off glc +; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off glc +; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: 
global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start @@ -1864,7 +1866,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1876,10 +1878,10 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start @@ -1888,7 +1890,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 ; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; 
GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 @@ -1900,62 +1902,64 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: 
s_mov_b32 s5, s6 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 + %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst ret float %result } -define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmin_noret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__ftz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1963,162 +1967,154 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 
th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__ftz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__ftz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off 
offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__ftz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:2044 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 
exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__ftz: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__ftz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] 
+; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__ftz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__ftz: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: 
buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret float %result + %unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst + ret void } -define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2126,162 +2122,157 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; 
GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off offset:-2048 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:2044 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:-2048 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off offset:2044 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: 
v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; 
GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret float %result + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + ret void } -define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2289,15 +2280,15 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start @@ 
-2306,7 +2297,7 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2318,30 +2309,30 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off +; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:-2048 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off +; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off offset:-2048 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: 
global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2349,7 +2340,7 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2361,10 +2352,10 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2372,7 +2363,7 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2384,9 +2375,11 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: 
global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 @@ -2407,210 +2400,322 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %gep = 
getelementptr float, ptr addrspace(1) %ptr, i64 -512 + %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst ret void } -define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 +; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:2044 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off offset:2044 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: 
s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: 
v_max_f32_e32 v4, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; 
GFX8-NEXT: v_min_f32_e32 v5, v0, v1 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; GFX7-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: 
global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; GFX6-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB14_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void + %result = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst + ret float %result } -define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: 
global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start @@ -2618,10 +2723,10 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX940-NEXT: 
s_waitcnt vmcnt(0) ; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 @@ -2631,30 +2736,62 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:-2048 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; 
GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off offset:-2048 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start @@ -2662,8 +2799,10 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -2674,10 +2813,10 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start @@ -2685,7 +2824,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2697,11 +2836,11 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: ; GFX8: ; %bb.0: ; GFX8-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 @@ -2722,890 +2861,86 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void -} - -define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 
0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 
vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; 
GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], 
off offset:2044 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: 
global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GFX7-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB16_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GFX6-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: 
buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB16_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 - ret float %result -} - -define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 -; 
GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: buffer_wbl2 -; 
GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop 
Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 
s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX6-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB17_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void -} - -; -------------------------------------------------------------------- -; double -; -------------------------------------------------------------------- - -define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 
-; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: 
global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc -; GFX908-NEXT: 
s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-NEXT: 
s_setpc_b64 s[30:31] -; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret double %result -} - -define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; 
GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 
v1, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; 
-; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX7-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX7-NEXT: v_mov_b32_e32 v6, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; 
GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v5 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX6-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v6, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v5 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB15_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 - %result = 
atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret double %result + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst + ret void } -define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; -------------------------------------------------------------------- +; double +; -------------------------------------------------------------------- + +define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3613,35 +2948,35 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; 
GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3649,7 +2984,7 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] ; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: 
buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -3657,89 +2992,89 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: s_cbranch_execnz .LBB16_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f64: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; 
GFX908-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] ; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 +; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: 
v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -3747,13 +3082,13 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -3761,355 +3096,373 @@ define double 
@global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 - %result = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst ret double %result } -define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: +define double @global_agent_atomic_fmin_ret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN +; 
GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 
v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: s_cbranch_execnz .LBB17_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off +; GFX90A-NEXT: 
global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 +; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: 
global_agent_atomic_fmin_ret_f64__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc ; 
GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 + %result = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst + ret double %result } -define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +define double @global_agent_atomic_fmin_ret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; 
GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:2040 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; 
GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: 
global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off offset:2040 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:2040 +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: 
v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 +; GFX908-NEXT: s_cbranch_execnz .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 
v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 +; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: 
v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 - %unused = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 + %result = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst + ret double %result } -define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -4117,34 +3470,34 @@ define void 
@global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB23_1 +; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -4153,41 +3506,41 @@ define void 
@global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-NEXT: s_cbranch_execnz .LBB19_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off offset:-2048 +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048 +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f64: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; 
GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] @@ -4195,20 +3548,18 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 +; GFX908-NEXT: s_cbranch_execnz .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -4221,388 +3572,373 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB23_1 +; GFX8-NEXT: s_cbranch_execnz .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: 
global_agent_atomic_fmin_noret_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 - %unused = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst ret void } -define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: +define void @global_agent_atomic_fmin_noret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: 
global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], 
v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:2040 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-NEXT: s_cbranch_execnz .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: 
v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off offset:2040 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:2040 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off 
glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB24_1 +; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; 
GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB24_1 +; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 - ret double %result + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 + %unused = atomicrmw fmin ptr 
addrspace(1) %gep, double %val syncscope("agent") seq_cst + ret void } -define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +define void @global_agent_atomic_fmin_noret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt 
null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off offset:-2048 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: 
global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB25_1 +; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, 
v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB25_1 +; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt 
vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 - ret double %result + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 + %unused = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst + ret void } ; -------------------------------------------------------------------- ; half ; -------------------------------------------------------------------- -define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: +define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -4620,7 +3956,7 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 
0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -4641,13 +3977,13 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -4660,7 +3996,7 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -4676,13 +4012,13 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB26_1 +; GFX940-NEXT: s_cbranch_execnz .LBB22_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 @@ -4696,7 +4032,7 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -4718,13 +4054,13 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-NEXT: s_cbranch_execnz .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -4736,7 +4072,7 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -4753,13 +4089,13 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, 
vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB26_1 +; GFX10-NEXT: s_cbranch_execnz .LBB22_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -4772,7 +4108,7 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -4787,13 +4123,13 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -4806,7 +4142,7 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB22_1: ; 
%atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -4821,13 +4157,13 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB26_1 +; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -4840,7 +4176,7 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -4856,13 +4192,13 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB26_1 +; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4879,7 +4215,7 @@ 
define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v7, v2 -; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -4898,14 +4234,14 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB26_1 +; GFX7-NEXT: s_cbranch_execnz .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -4922,7 +4258,7 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v7, v2 -; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -4941,19 +4277,19 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB26_1 +; GFX6-NEXT: s_cbranch_execnz .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, 
v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst ret half %result } -define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -4972,7 +4308,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -4993,13 +4329,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: 
s_mov_b64 s[0:1], 0x7fe @@ -5014,7 +4350,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -5030,13 +4366,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB27_1 +; GFX940-NEXT: s_cbranch_execnz .LBB23_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -5051,7 +4387,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -5073,13 +4409,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-NEXT: s_cbranch_execnz .LBB23_1 ; GFX11-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -5092,7 +4428,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -5109,13 +4445,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: s_cbranch_execnz .LBB23_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -5129,7 +4465,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -5144,13 +4480,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -5164,7 +4500,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -5179,13 +4515,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB27_1 +; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -5199,7 +4535,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -5215,13 +4551,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB27_1 +; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -5239,7 +4575,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -5258,14 +4594,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB27_1 +; GFX7-NEXT: s_cbranch_execnz .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, 
exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -5283,7 +4619,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -5303,7 +4639,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB27_1 +; GFX6-NEXT: s_cbranch_execnz .LBB23_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -5311,12 +4647,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst ret half %result } -define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +define half 
@global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5335,7 +4671,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -5356,13 +4692,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-NEXT: s_cbranch_execnz .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -5378,7 +4714,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -5394,13 +4730,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: 
v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_1 +; GFX940-NEXT: s_cbranch_execnz .LBB24_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -5415,7 +4751,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -5437,13 +4773,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-NEXT: s_cbranch_execnz .LBB24_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -5456,7 +4792,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: 
v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -5473,13 +4809,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB28_1 +; GFX10-NEXT: s_cbranch_execnz .LBB24_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -5493,7 +4829,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -5508,13 +4844,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 
s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -5528,7 +4864,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -5543,13 +4879,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB28_1 +; GFX908-NEXT: s_cbranch_execnz .LBB24_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -5563,7 +4899,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -5579,13 +4915,13 @@ define half 
@global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB28_1 +; GFX8-NEXT: s_cbranch_execnz .LBB24_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -5603,7 +4939,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -5622,14 +4958,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB28_1 +; GFX7-NEXT: s_cbranch_execnz .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -5647,7 +4983,7 @@ define half 
@global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -5667,7 +5003,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB28_1 +; GFX6-NEXT: s_cbranch_execnz .LBB24_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -5675,12 +5011,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 - %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst ret half %result } -define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5698,7 +5034,7 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-NEXT: 
.LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5719,12 +5055,12 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -5737,7 +5073,7 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -5753,12 +5089,12 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB29_1 +; GFX940-NEXT: s_cbranch_execnz .LBB25_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 @@ -5772,7 +5108,7 @@ define void 
@global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5794,12 +5130,12 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-NEXT: s_cbranch_execnz .LBB25_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -5811,7 +5147,7 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5828,12 +5164,12 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB29_1 +; GFX10-NEXT: s_cbranch_execnz .LBB25_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: 
global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -5846,7 +5182,7 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -5861,12 +5197,12 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -5879,7 +5215,7 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5894,12 +5230,12 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, 
exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB29_1 +; GFX908-NEXT: s_cbranch_execnz .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -5912,7 +5248,7 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5928,12 +5264,12 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB29_1 +; GFX8-NEXT: s_cbranch_execnz .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5950,7 +5286,7 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -5969,12 +5305,12 @@ define void 
@global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB29_1 +; GFX7-NEXT: s_cbranch_execnz .LBB25_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -5991,7 +5327,7 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -6011,17 +5347,17 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB29_1 +; GFX6-NEXT: s_cbranch_execnz .LBB25_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst ret void } -define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +define void 
@global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6040,7 +5376,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6061,12 +5397,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -6081,7 +5417,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6097,12 +5433,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; 
GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB30_1 +; GFX940-NEXT: s_cbranch_execnz .LBB26_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -6117,7 +5453,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6139,12 +5475,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-NEXT: s_cbranch_execnz .LBB26_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -6157,7 +5493,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB30_1: 
; %atomicrmw.start +; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6174,12 +5510,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB30_1 +; GFX10-NEXT: s_cbranch_execnz .LBB26_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -6193,7 +5529,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6208,12 +5544,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -6227,7 +5563,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6242,12 +5578,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB30_1 +; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -6261,7 +5597,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6277,12 +5613,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB30_1 +; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: 
s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -6300,7 +5636,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6319,12 +5655,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB30_1 +; GFX7-NEXT: s_cbranch_execnz .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -6342,7 +5678,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6362,18 +5698,18 @@ define void 
@global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB30_1 +; GFX6-NEXT: s_cbranch_execnz .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst ret void } -define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6392,7 +5728,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6413,12 +5749,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -6434,7 +5770,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6450,12 +5786,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB31_1 +; GFX940-NEXT: s_cbranch_execnz .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -6470,7 +5806,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6492,12 +5828,12 @@ define void 
@global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -6510,7 +5846,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6527,12 +5863,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB31_1 +; GFX10-NEXT: s_cbranch_execnz .LBB27_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -6546,7 +5882,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: v_not_b32_e32 
v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6561,12 +5897,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -6580,7 +5916,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6595,12 +5931,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB31_1 +; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: 
global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 @@ -6614,7 +5950,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6630,12 +5966,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB31_1 +; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -6653,7 +5989,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6672,12 +6008,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB31_1 +; GFX7-NEXT: s_cbranch_execnz .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -6695,7 +6031,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6715,18 +6051,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB31_1 +; GFX6-NEXT: s_cbranch_execnz .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 - %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst ret void } -define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +define half 
@global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6736,7 +6072,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -6754,20 +6090,20 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -6781,19 +6117,19 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], 
vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB32_1 +; GFX940-NEXT: s_cbranch_execnz .LBB28_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -6812,19 +6148,19 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -6840,20 +6176,20 @@ define half 
@global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB32_1 +; GFX10-NEXT: s_cbranch_execnz .LBB28_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -6866,20 +6202,20 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; 
GFX908-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -6892,13 +6228,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB32_1 +; GFX908-NEXT: s_cbranch_execnz .LBB28_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -6906,7 +6242,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -6920,12 +6256,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB32_1 +; GFX8-NEXT: s_cbranch_execnz .LBB28_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: 
; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -6936,7 +6272,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6953,13 +6289,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB32_1 +; GFX7-NEXT: s_cbranch_execnz .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -6970,7 +6306,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6988,19 +6324,19 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB32_1 +; GFX6-NEXT: s_cbranch_execnz .LBB28_1 ; GFX6-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4 ret half %result } -define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -7010,7 +6346,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 @@ -7028,19 +6364,19 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; 
GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_max_f16_e32 v2, v3, v3 @@ -7054,18 +6390,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB33_1 +; GFX940-NEXT: s_cbranch_execnz .LBB29_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f16_e32 v2, v3, v3 @@ -7084,18 +6420,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-NEXT: s_cbranch_execnz .LBB29_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 
s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 @@ -7111,19 +6447,19 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB33_1 +; GFX10-NEXT: s_cbranch_execnz .LBB29_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 @@ -7136,19 +6472,19 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 ; 
GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 @@ -7161,12 +6497,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB33_1 +; GFX908-NEXT: s_cbranch_execnz .LBB29_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 @@ -7174,7 +6510,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 -; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 @@ -7188,12 +6524,12 @@ define void 
@global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB33_1 +; GFX8-NEXT: s_cbranch_execnz .LBB29_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -7204,7 +6540,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -7221,12 +6557,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB33_1 +; GFX7-NEXT: s_cbranch_execnz .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -7237,7 +6573,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 -; 
GFX6-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -7255,18 +6591,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB33_1 +; GFX6-NEXT: s_cbranch_execnz .LBB29_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4 ret void } -define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -7285,7 +6621,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -7306,13 +6642,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: 
s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -7327,7 +6663,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -7343,13 +6679,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB34_1 +; GFX940-NEXT: s_cbranch_execnz .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -7364,7 +6700,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -7386,13 +6722,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-NEXT: s_cbranch_execnz .LBB30_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -7405,7 +6741,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -7422,13 +6758,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB34_1 +; GFX10-NEXT: s_cbranch_execnz .LBB30_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: 
global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -7442,7 +6778,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -7459,13 +6795,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -7479,7 +6815,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -7494,13 +6830,13 @@ define half 
@global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB34_1 +; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -7514,7 +6850,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -7530,13 +6866,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB34_1 +; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -7554,7 +6890,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; 
GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -7573,14 +6909,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB34_1 +; GFX7-NEXT: s_cbranch_execnz .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -7598,7 +6934,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -7618,7 +6954,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB34_1 +; GFX6-NEXT: s_cbranch_execnz .LBB30_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -7626,12 +6962,12 @@ define half 
@global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmin ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(1) %gep, half %val seq_cst ret half %result } -define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -7650,7 +6986,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7671,12 +7007,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -7691,7 +7027,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7707,12 +7043,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB35_1 +; GFX940-NEXT: s_cbranch_execnz .LBB31_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -7727,7 +7063,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7749,12 +7085,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-NEXT: s_cbranch_execnz .LBB31_1 ; GFX11-NEXT: 
; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -7767,7 +7103,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7784,12 +7120,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB35_1 +; GFX10-NEXT: s_cbranch_execnz .LBB31_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -7803,7 +7139,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ 
-7820,12 +7156,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -7839,7 +7175,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7854,12 +7190,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB35_1 +; GFX908-NEXT: s_cbranch_execnz .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -7873,7 +7209,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: 
v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7889,12 +7225,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB35_1 +; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -7912,7 +7248,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7931,12 +7267,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB35_1 +; GFX7-NEXT: s_cbranch_execnz .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: 
global_system_atomic_fmin_noret_f16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -7954,7 +7290,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7974,13 +7310,13 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB35_1 +; GFX6-NEXT: s_cbranch_execnz .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val seq_cst ret void } @@ -7988,8 +7324,8 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; bfloat ; -------------------------------------------------------------------- -define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: +define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -8006,7 +7342,7 @@ define bfloat 
@global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -8034,13 +7370,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-NEXT: s_cbranch_execnz .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -8054,7 +7390,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -8076,13 +7412,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB36_1 +; GFX940-NEXT: s_cbranch_execnz .LBB32_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: 
v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 @@ -8096,7 +7432,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -8125,13 +7461,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-NEXT: s_cbranch_execnz .LBB32_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -8143,7 +7479,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -8164,13 +7500,13 @@ define bfloat 
@global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB36_1 +; GFX10-NEXT: s_cbranch_execnz .LBB32_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -8184,7 +7520,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -8203,13 +7539,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -8223,7 +7559,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; 
GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -8242,13 +7578,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB36_1 +; GFX908-NEXT: s_cbranch_execnz .LBB32_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -8261,7 +7597,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -8282,13 +7618,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB36_1 +; GFX8-NEXT: s_cbranch_execnz .LBB32_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: 
global_agent_atomic_fmin_ret_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -8305,7 +7641,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -8325,14 +7661,14 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB36_1 +; GFX7-NEXT: s_cbranch_execnz .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -8349,7 +7685,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -8369,19 +7705,19 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz 
.LBB36_1 +; GFX6-NEXT: s_cbranch_execnz .LBB32_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst ret bfloat %result } -define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -8400,7 +7736,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -8428,13 +7764,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -8450,7 +7786,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -8472,13 +7808,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB37_1 +; GFX940-NEXT: s_cbranch_execnz .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -8494,7 +7830,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -8523,13 +7859,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: 
s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-NEXT: s_cbranch_execnz .LBB33_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -8542,7 +7878,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -8563,13 +7899,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB37_1 +; GFX10-NEXT: s_cbranch_execnz .LBB33_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -8584,7 +7920,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; 
GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -8603,13 +7939,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -8624,7 +7960,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -8643,13 +7979,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB37_1 +; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -8663,7 +7999,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -8684,13 +8020,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB37_1 +; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -8708,7 +8044,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8728,14 +8064,14 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_or_b64 s[8:9], vcc, 
s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB37_1 +; GFX7-NEXT: s_cbranch_execnz .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -8753,7 +8089,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8774,7 +8110,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB37_1 +; GFX6-NEXT: s_cbranch_execnz .LBB33_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -8782,12 +8118,12 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst ret bfloat %result } -define bfloat 
@global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -8806,7 +8142,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -8834,13 +8170,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -8857,7 +8193,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start ; 
GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -8879,13 +8215,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB38_1 +; GFX940-NEXT: s_cbranch_execnz .LBB34_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -8901,7 +8237,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -8930,13 +8266,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-NEXT: s_cbranch_execnz .LBB34_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -8949,7 +8285,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -8970,13 +8306,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB38_1 +; GFX10-NEXT: s_cbranch_execnz .LBB34_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -8991,7 +8327,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -9010,13 +8346,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: 
s_cbranch_execnz .LBB38_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -9031,7 +8367,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -9050,13 +8386,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB38_1 +; GFX908-NEXT: s_cbranch_execnz .LBB34_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -9070,7 +8406,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX8-NEXT: 
.LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -9091,13 +8427,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB38_1 +; GFX8-NEXT: s_cbranch_execnz .LBB34_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -9115,7 +8451,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -9135,14 +8471,14 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB38_1 +; GFX7-NEXT: s_cbranch_execnz .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: ; GFX6: ; 
%bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -9160,7 +8496,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -9181,7 +8517,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB38_1 +; GFX6-NEXT: s_cbranch_execnz .LBB34_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -9189,12 +8525,12 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 - %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst ret bfloat %result } -define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -9211,7 +8547,7 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: 
v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9238,12 +8574,12 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -9257,7 +8593,7 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9279,12 +8615,12 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB39_1 +; GFX940-NEXT: s_cbranch_execnz .LBB35_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 @@ -9298,7 +8634,7 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9326,12 +8662,12 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-NEXT: s_cbranch_execnz .LBB35_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -9343,7 +8679,7 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9364,12 +8700,12 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; 
GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB39_1 +; GFX10-NEXT: s_cbranch_execnz .LBB35_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -9383,7 +8719,7 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9402,12 +8738,12 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -9421,7 +8757,7 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: 
.LBB39_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9440,12 +8776,12 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB39_1 +; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -9458,7 +8794,7 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9479,12 +8815,12 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB39_1 +; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16: ; GFX7: 
; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -9501,7 +8837,7 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9521,12 +8857,12 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB39_1 +; GFX7-NEXT: s_cbranch_execnz .LBB35_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -9543,7 +8879,7 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -9564,17 +8900,17 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB39_1 +; GFX6-NEXT: s_cbranch_execnz .LBB35_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; 
GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst ret void } -define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -9593,7 +8929,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9620,12 +8956,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -9641,7 +8977,7 @@ define void 
@global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9663,12 +8999,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB40_1 +; GFX940-NEXT: s_cbranch_execnz .LBB36_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -9684,7 +9020,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9712,12 +9048,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-NEXT: s_cbranch_execnz .LBB36_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -9730,7 +9066,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9751,12 +9087,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB40_1 +; GFX10-NEXT: s_cbranch_execnz .LBB36_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -9771,7 +9107,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9790,12 +9126,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -9810,7 +9146,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9829,12 +9165,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB40_1 +; GFX908-NEXT: s_cbranch_execnz .LBB36_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -9848,7 +9184,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9869,12 +9205,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB40_1 +; GFX8-NEXT: s_cbranch_execnz .LBB36_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -9892,7 +9228,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9912,12 +9248,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB40_1 +; GFX7-NEXT: s_cbranch_execnz .LBB36_1 ; GFX7-NEXT: ; %bb.2: 
; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -9935,7 +9271,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9956,18 +9292,18 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB40_1 +; GFX6-NEXT: s_cbranch_execnz .LBB36_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst ret void } -define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 
0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -9986,7 +9322,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10013,12 +9349,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -10035,7 +9371,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10057,12 +9393,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB41_1 +; 
GFX940-NEXT: s_cbranch_execnz .LBB37_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -10078,7 +9414,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10106,12 +9442,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB41_1 +; GFX11-NEXT: s_cbranch_execnz .LBB37_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -10124,7 +9460,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: 
s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10145,12 +9481,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB41_1 +; GFX10-NEXT: s_cbranch_execnz .LBB37_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -10165,7 +9501,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10184,12 +9520,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: ; GFX908: ; %bb.0: ; 
GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -10204,7 +9540,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10223,12 +9559,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB41_1 +; GFX908-NEXT: s_cbranch_execnz .LBB37_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 @@ -10242,7 +9578,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10263,12 +9599,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: 
v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB41_1 +; GFX8-NEXT: s_cbranch_execnz .LBB37_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -10286,7 +9622,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10306,12 +9642,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB41_1 +; GFX7-NEXT: s_cbranch_execnz .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -10329,7 +9665,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX6-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10350,18 +9686,18 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB41_1 +; GFX6-NEXT: s_cbranch_execnz .LBB37_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 - %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst ret void } -define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -10371,7 +9707,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -10396,13 +9732,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB42_1 +; GFX12-NEXT: s_cbranch_execnz .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10410,7 +9746,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff ; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -10431,20 +9767,20 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB42_1 +; GFX940-NEXT: s_cbranch_execnz .LBB38_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start +; 
GFX11-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -10470,19 +9806,19 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB42_1 +; GFX11-NEXT: s_cbranch_execnz .LBB38_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -10503,13 +9839,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB42_1 +; GFX10-NEXT: s_cbranch_execnz .LBB38_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, 
v[0:1], off offset:2046 @@ -10517,7 +9853,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -10536,13 +9872,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10550,7 +9886,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -10569,13 +9905,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB42_1 +; GFX908-NEXT: 
s_cbranch_execnz .LBB38_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -10583,7 +9919,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -10603,12 +9939,12 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB42_1 +; GFX8-NEXT: s_cbranch_execnz .LBB38_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -10619,7 +9955,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; 
GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10637,13 +9973,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB42_1 +; GFX7-NEXT: s_cbranch_execnz .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -10654,7 +9990,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10673,19 +10009,19 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB42_1 +; GFX6-NEXT: s_cbranch_execnz .LBB38_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4 
ret bfloat %result } -define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -10695,7 +10031,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10719,12 +10055,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10732,7 +10068,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff ; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; 
GFX940-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10753,19 +10089,19 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB43_1 +; GFX940-NEXT: s_cbranch_execnz .LBB39_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10790,18 +10126,18 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB43_1 +; GFX11-NEXT: s_cbranch_execnz .LBB39_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: 
global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10822,12 +10158,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB43_1 +; GFX10-NEXT: s_cbranch_execnz .LBB39_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10835,7 +10171,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10854,12 +10190,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: 
global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10867,7 +10203,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10886,12 +10222,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB43_1 +; GFX908-NEXT: s_cbranch_execnz .LBB39_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 @@ -10899,7 +10235,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10919,12 +10255,12 @@ define void 
@global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB43_1 +; GFX8-NEXT: s_cbranch_execnz .LBB39_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -10935,7 +10271,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10953,12 +10289,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB43_1 +; GFX7-NEXT: s_cbranch_execnz .LBB39_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -10969,7 +10305,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: 
v_and_b32_e32 v4, 0xffff0000, v2 -; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10988,18 +10324,18 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB43_1 +; GFX6-NEXT: s_cbranch_execnz .LBB39_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4 ret void } -define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -11018,7 +10354,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -11046,13 +10382,13 @@ define bfloat 
@global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -11068,7 +10404,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -11090,13 +10426,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB44_1 +; GFX940-NEXT: s_cbranch_execnz .LBB40_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -11112,7 +10448,7 @@ define bfloat 
@global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -11141,13 +10477,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-NEXT: s_cbranch_execnz .LBB40_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -11160,7 +10496,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -11181,13 +10517,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB44_1 +; GFX10-NEXT: s_cbranch_execnz .LBB40_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; 
GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -11202,7 +10538,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -11223,13 +10559,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -11244,7 +10580,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: 
v_mov_b32_e32 v6, v5 @@ -11263,13 +10599,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB44_1 +; GFX908-NEXT: s_cbranch_execnz .LBB40_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -11283,7 +10619,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -11304,13 +10640,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB44_1 +; GFX8-NEXT: s_cbranch_execnz .LBB40_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -11328,7 +10664,7 @@ define bfloat 
@global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -11348,14 +10684,14 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB44_1 +; GFX7-NEXT: s_cbranch_execnz .LBB40_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -11373,7 +10709,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -11394,7 +10730,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB44_1 +; GFX6-NEXT: s_cbranch_execnz .LBB40_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: 
v_lshrrev_b32_e32 v0, v6, v4 @@ -11402,12 +10738,12 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val seq_cst ret bfloat %result } -define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -11426,7 +10762,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11453,12 +10789,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: 
global_system_atomic_fmin_noret_bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -11474,7 +10810,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11496,12 +10832,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB45_1 +; GFX940-NEXT: s_cbranch_execnz .LBB41_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -11517,7 +10853,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11545,12 +10881,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-NEXT: s_cbranch_execnz .LBB41_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -11563,7 +10899,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11584,12 +10920,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB45_1 +; GFX10-NEXT: s_cbranch_execnz .LBB41_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -11604,7 +10940,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; 
GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11625,12 +10961,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -11645,7 +10981,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11664,12 +11000,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB45_1 +; GFX908-NEXT: s_cbranch_execnz .LBB41_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 
s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -11683,7 +11019,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11704,12 +11040,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB45_1 +; GFX8-NEXT: s_cbranch_execnz .LBB41_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -11727,7 +11063,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11747,12 +11083,12 @@ define void 
@global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB45_1 +; GFX7-NEXT: s_cbranch_execnz .LBB41_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -11770,7 +11106,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11791,13 +11127,13 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB45_1 +; GFX6-NEXT: s_cbranch_execnz .LBB41_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val seq_cst ret void } @@ -11805,8 +11141,8 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; <2 x half> ; -------------------------------------------------------------------- -define <2 x 
half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: +define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -11816,7 +11152,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -11831,19 +11167,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB46_1 +; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -11857,19 +11193,19 @@ define <2 x half> 
@global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB46_1 +; GFX940-NEXT: s_cbranch_execnz .LBB42_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -11885,19 +11221,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB46_1 +; GFX11-NEXT: s_cbranch_execnz .LBB42_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -11911,19 +11247,19 
@@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB46_1 +; GFX10-NEXT: s_cbranch_execnz .LBB42_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -11935,19 +11271,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; 
GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -11959,20 +11295,20 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB46_1 +; GFX908-NEXT: s_cbranch_execnz .LBB42_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -11987,13 +11323,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB46_1 +; GFX8-NEXT: s_cbranch_execnz .LBB42_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -12010,7 +11346,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: 
v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -12035,14 +11371,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB46_1 +; GFX7-NEXT: s_cbranch_execnz .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -12059,7 +11395,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -12085,19 +11421,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB46_1 +; GFX6-NEXT: s_cbranch_execnz .LBB42_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr 
addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst ret <2 x half> %result } -define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12107,7 +11443,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -12122,19 +11458,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB47_1 +; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; 
GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -12148,19 +11484,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB47_1 +; GFX940-NEXT: s_cbranch_execnz .LBB43_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -12176,19 +11512,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB47_1 +; GFX11-NEXT: s_cbranch_execnz .LBB43_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -12202,19 +11538,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB47_1 +; GFX10-NEXT: s_cbranch_execnz .LBB43_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -12226,19 +11562,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 
+; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -12250,13 +11586,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB47_1 +; GFX908-NEXT: s_cbranch_execnz .LBB43_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -12265,7 +11601,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -12280,12 +11616,12 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz 
.LBB47_1 +; GFX8-NEXT: s_cbranch_execnz .LBB43_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -12302,7 +11638,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -12327,14 +11663,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB47_1 +; GFX7-NEXT: s_cbranch_execnz .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -12351,7 +11687,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: 
v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -12377,7 +11713,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB47_1 +; GFX6-NEXT: s_cbranch_execnz .LBB43_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 @@ -12385,12 +11721,12 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst ret <2 x half> %result } -define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12400,7 +11736,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -12415,19 +11751,19 @@ define <2 x half> 
@global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB48_1 +; GFX12-NEXT: s_cbranch_execnz .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -12441,19 +11777,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB48_1 +; GFX940-NEXT: s_cbranch_execnz .LBB44_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -12469,19 +11805,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB48_1 +; GFX11-NEXT: s_cbranch_execnz .LBB44_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -12495,19 +11831,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB48_1 +; GFX10-NEXT: s_cbranch_execnz .LBB44_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; 
GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -12519,19 +11855,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -12543,13 +11879,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB48_1 +; GFX908-NEXT: s_cbranch_execnz .LBB44_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -12558,7 +11894,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -12573,12 +11909,12 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB48_1 +; GFX8-NEXT: s_cbranch_execnz .LBB44_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -12599,7 +11935,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -12624,12 +11960,12 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB48_1 +; GFX7-NEXT: s_cbranch_execnz .LBB44_1 ; GFX7-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -12650,7 +11986,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -12676,18 +12012,18 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB48_1 +; GFX6-NEXT: s_cbranch_execnz .LBB44_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst ret <2 x half> %result } -define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; 
GFX12-NEXT: s_wait_expcnt 0x0 @@ -12697,7 +12033,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -12712,18 +12048,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB49_1 +; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12737,18 +12073,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB49_1 +; GFX940-NEXT: s_cbranch_execnz .LBB45_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: +; 
GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12764,18 +12100,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB49_1 +; GFX11-NEXT: s_cbranch_execnz .LBB45_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12789,18 +12125,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB49_1 +; GFX10-NEXT: s_cbranch_execnz .LBB45_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16: 
; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12812,18 +12148,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12835,19 +12171,19 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB49_1 +; GFX908-NEXT: s_cbranch_execnz .LBB45_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16: ; GFX8: ; %bb.0: ; 
GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -12862,12 +12198,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB49_1 +; GFX8-NEXT: s_cbranch_execnz .LBB45_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -12884,7 +12220,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12909,12 +12245,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB49_1 +; GFX7-NEXT: s_cbranch_execnz .LBB45_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: 
s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -12931,7 +12267,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12957,17 +12293,17 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB49_1 +; GFX6-NEXT: s_cbranch_execnz .LBB45_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst ret void } -define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12977,7 +12313,7 @@ define void 
@global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -12992,18 +12328,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB50_1 +; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13017,18 +12353,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB50_1 +; GFX940-NEXT: s_cbranch_execnz .LBB46_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; 
GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13044,18 +12380,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-NEXT: s_cbranch_execnz .LBB46_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13069,18 +12405,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB50_1 +; GFX10-NEXT: s_cbranch_execnz .LBB46_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: 
global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13092,18 +12428,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13115,12 +12451,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB50_1 +; GFX908-NEXT: s_cbranch_execnz .LBB46_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, 
exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -13129,7 +12465,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -13144,12 +12480,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB50_1 +; GFX8-NEXT: s_cbranch_execnz .LBB46_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -13166,7 +12502,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: 
v_cvt_f16_f32_e32 v4, v4 @@ -13191,12 +12527,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB50_1 +; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -13213,7 +12549,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -13239,18 +12575,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB50_1 +; GFX6-NEXT: s_cbranch_execnz .LBB46_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst ret void } -define void 
@global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -13260,7 +12596,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -13275,18 +12611,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB51_1 +; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ 
-13300,18 +12636,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB51_1 +; GFX940-NEXT: s_cbranch_execnz .LBB47_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13327,18 +12663,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-NEXT: s_cbranch_execnz .LBB47_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) 
; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13352,18 +12688,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB51_1 +; GFX10-NEXT: s_cbranch_execnz .LBB47_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13375,18 +12711,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13398,12 +12734,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB51_1 +; GFX908-NEXT: s_cbranch_execnz .LBB47_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 @@ -13412,7 +12748,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -13427,12 +12763,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB51_1 +; GFX8-NEXT: s_cbranch_execnz .LBB47_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: ; GFX7: ; %bb.0: ; 
GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -13453,7 +12789,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -13478,12 +12814,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB51_1 +; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -13504,7 +12840,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -13530,18 +12866,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB51_1 +; GFX6-NEXT: s_cbranch_execnz .LBB47_1 ; GFX6-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst ret void } -define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -13551,7 +12887,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -13566,19 +12902,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB52_1 +; GFX12-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: 
global_system_atomic_fmin_ret_v2f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -13592,19 +12928,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB52_1 +; GFX940-NEXT: s_cbranch_execnz .LBB48_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -13620,19 +12956,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-NEXT: s_cbranch_execnz .LBB48_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 
s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -13646,19 +12982,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB52_1 +; GFX10-NEXT: s_cbranch_execnz .LBB48_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -13672,19 +13008,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 ; GFX90A-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -13696,13 +13032,13 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB52_1 +; GFX908-NEXT: s_cbranch_execnz .LBB48_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -13711,7 +13047,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -13726,12 +13062,12 @@ 
define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB52_1 +; GFX8-NEXT: s_cbranch_execnz .LBB48_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -13748,7 +13084,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -13773,14 +13109,14 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB52_1 +; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -13797,7 +13133,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 
v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -13823,7 +13159,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB52_1 +; GFX6-NEXT: s_cbranch_execnz .LBB48_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 @@ -13831,12 +13167,12 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst ret <2 x half> %result } -define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -13846,7 +13182,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB49_1: ; 
%atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -13861,18 +13197,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB53_1 +; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13886,18 +13222,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB53_1 +; GFX940-NEXT: s_cbranch_execnz .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; 
GFX11-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13913,18 +13249,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-NEXT: s_cbranch_execnz .LBB49_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13938,18 +13274,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB53_1 +; GFX10-NEXT: s_cbranch_execnz .LBB49_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 
s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13963,18 +13299,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13986,12 +13322,12 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB53_1 +; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: 
v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -14000,7 +13336,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -14015,12 +13351,12 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB53_1 +; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -14037,7 +13373,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -14062,12 +13398,12 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB53_1 +; GFX7-NEXT: 
s_cbranch_execnz .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -14084,7 +13420,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -14110,13 +13446,13 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB53_1 +; GFX6-NEXT: s_cbranch_execnz .LBB49_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst ret void } @@ -14124,8 +13460,8 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; <2 x bfloat> ; -------------------------------------------------------------------- -define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: +define <2 x bfloat> 
@global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -14136,7 +13472,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -14168,13 +13504,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -14183,7 +13519,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -14210,13 +13546,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; 
GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB54_1 +; GFX940-NEXT: s_cbranch_execnz .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off @@ -14225,7 +13561,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -14258,21 +13594,21 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-NEXT: s_cbranch_execnz .LBB50_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; 
GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -14299,13 +13635,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB54_1 +; GFX10-NEXT: s_cbranch_execnz .LBB50_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -14314,7 +13650,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -14339,13 +13675,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -14354,7 +13690,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -14379,20 +13715,20 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB54_1 +; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -14420,13 +13756,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB54_1 +; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] 
; -; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -14442,7 +13778,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -14464,14 +13800,14 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB54_1 +; GFX7-NEXT: s_cbranch_execnz .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -14487,7 +13823,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -14510,19 +13846,19 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX6-NEXT: 
s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB54_1 +; GFX6-NEXT: s_cbranch_execnz .LBB50_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst ret <2 x bfloat> %result } -define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -14533,7 +13869,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -14565,13 +13901,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: 
s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -14580,7 +13916,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -14607,13 +13943,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB55_1 +; GFX940-NEXT: s_cbranch_execnz .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -14622,7 +13958,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -14655,21 +13991,21 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-NEXT: s_cbranch_execnz .LBB51_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -14696,13 +14032,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB55_1 +; GFX10-NEXT: s_cbranch_execnz .LBB51_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -14711,7 +14047,7 
@@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -14736,13 +14072,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -14751,7 +14087,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -14776,13 +14112,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB55_1 +; GFX908-NEXT: s_cbranch_execnz .LBB51_1 ; GFX908-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -14791,7 +14127,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -14819,12 +14155,12 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB55_1 +; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -14840,7 +14176,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -14862,14 
+14198,14 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB55_1 +; GFX7-NEXT: s_cbranch_execnz .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -14885,7 +14221,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -14908,7 +14244,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB55_1 +; GFX6-NEXT: s_cbranch_execnz .LBB51_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 @@ -14916,12 +14252,12 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 
+ %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst ret <2 x bfloat> %result } -define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -14932,7 +14268,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -14964,13 +14300,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-NEXT: s_cbranch_execnz .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -14979,7 +14315,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX940-NEXT: s_movk_i32 
s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -15006,13 +14342,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB56_1 +; GFX940-NEXT: s_cbranch_execnz .LBB52_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 @@ -15021,7 +14357,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -15054,21 +14390,21 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-NEXT: s_cbranch_execnz .LBB52_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; 
GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -15095,13 +14431,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB56_1 +; GFX10-NEXT: s_cbranch_execnz .LBB52_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -15110,7 +14446,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -15135,13 +14471,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; 
GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -15150,7 +14486,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -15175,13 +14511,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB56_1 +; GFX908-NEXT: s_cbranch_execnz .LBB52_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -15190,7 +14526,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; 
GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -15218,12 +14554,12 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB56_1 +; GFX8-NEXT: s_cbranch_execnz .LBB52_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -15243,7 +14579,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -15265,12 +14601,12 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB56_1 +; GFX7-NEXT: s_cbranch_execnz .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; 
GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -15290,7 +14626,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -15313,18 +14649,18 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB56_1 +; GFX6-NEXT: s_cbranch_execnz .LBB52_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst ret <2 x bfloat> %result } -define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -15335,7 +14671,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; 
GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15366,12 +14702,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB57_1 +; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -15380,7 +14716,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15407,12 +14743,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB57_1 +; GFX940-NEXT: s_cbranch_execnz .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: 
global_agent_atomic_fmin_noret_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off @@ -15421,7 +14757,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15453,20 +14789,20 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-NEXT: s_cbranch_execnz .LBB53_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15493,12 +14829,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB57_1 +; GFX10-NEXT: s_cbranch_execnz .LBB53_1 ; GFX10-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -15507,7 +14843,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15532,12 +14868,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -15546,7 +14882,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15571,19 +14907,19 @@ define 
void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB57_1 +; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15611,12 +14947,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB57_1 +; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -15632,7 +14968,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15654,12 +14990,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB57_1 +; GFX7-NEXT: s_cbranch_execnz .LBB53_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -15675,7 +15011,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15698,17 +15034,17 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB57_1 +; GFX6-NEXT: s_cbranch_execnz .LBB53_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst ret void } -define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr 
addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -15719,7 +15055,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15750,12 +15086,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB58_1 +; GFX12-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15764,7 +15100,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: 
s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15791,12 +15127,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB58_1 +; GFX940-NEXT: s_cbranch_execnz .LBB54_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -15805,7 +15141,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15837,20 +15173,20 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB58_1 +; GFX11-NEXT: s_cbranch_execnz .LBB54_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, 
v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15877,12 +15213,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB58_1 +; GFX10-NEXT: s_cbranch_execnz .LBB54_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15891,7 +15227,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15916,12 +15252,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: 
global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15930,7 +15266,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15955,12 +15291,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB58_1 +; GFX908-NEXT: s_cbranch_execnz .LBB54_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -15969,7 +15305,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15997,12 +15333,12 @@ define void 
@global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB58_1 +; GFX8-NEXT: s_cbranch_execnz .LBB54_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -16018,7 +15354,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -16040,12 +15376,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB58_1 +; GFX7-NEXT: s_cbranch_execnz .LBB54_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -16061,7 +15397,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: 
v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -16084,18 +15420,18 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB58_1 +; GFX6-NEXT: s_cbranch_execnz .LBB54_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst ret void } -define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -16106,7 +15442,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16137,12 +15473,12 @@ define 
void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB59_1 +; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -16151,7 +15487,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16178,12 +15514,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB59_1 +; GFX940-NEXT: s_cbranch_execnz .LBB55_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 @@ -16192,7 +15528,7 @@ define void 
@global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16224,20 +15560,20 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB59_1 +; GFX11-NEXT: s_cbranch_execnz .LBB55_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16264,12 +15600,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB59_1 +; GFX10-NEXT: s_cbranch_execnz .LBB55_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: 
global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -16278,7 +15614,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16303,12 +15639,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB59_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -16317,7 +15653,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16342,12 +15678,12 @@ define void 
@global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB59_1 +; GFX908-NEXT: s_cbranch_execnz .LBB55_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 @@ -16356,7 +15692,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16384,12 +15720,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB59_1 +; GFX8-NEXT: s_cbranch_execnz .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -16409,7 +15745,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 
0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -16431,12 +15767,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB59_1 +; GFX7-NEXT: s_cbranch_execnz .LBB55_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -16456,7 +15792,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -16479,18 +15815,18 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB59_1 +; GFX6-NEXT: s_cbranch_execnz .LBB55_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x 
bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst ret void } -define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -16501,7 +15837,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -16533,13 +15869,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB60_1 +; GFX12-NEXT: s_cbranch_execnz .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16548,7 +15884,7 @@ define <2 x bfloat> 
@global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -16575,13 +15911,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB60_1 +; GFX940-NEXT: s_cbranch_execnz .LBB56_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -16590,7 +15926,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -16623,21 +15959,21 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB60_1 +; GFX11-NEXT: s_cbranch_execnz .LBB56_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 
0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -16664,13 +16000,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB60_1 +; GFX10-NEXT: s_cbranch_execnz .LBB56_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16679,7 +16015,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -16706,13 +16042,13 @@ 
define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16721,7 +16057,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -16746,13 +16082,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB60_1 +; GFX908-NEXT: s_cbranch_execnz .LBB56_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -16761,7 +16097,7 @@ define 
<2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -16789,12 +16125,12 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB60_1 +; GFX8-NEXT: s_cbranch_execnz .LBB56_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -16810,7 +16146,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16832,14 +16168,14 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB60_1 +; GFX7-NEXT: s_cbranch_execnz .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: 
v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -16855,7 +16191,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16878,7 +16214,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB60_1 +; GFX6-NEXT: s_cbranch_execnz .LBB56_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 @@ -16886,12 +16222,12 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst ret <2 x bfloat> %result } -define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x 
bfloat> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -16902,7 +16238,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16933,12 +16269,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB61_1 +; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16947,7 +16283,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16974,12 +16310,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: 
v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB61_1 +; GFX940-NEXT: s_cbranch_execnz .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -16988,7 +16324,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17020,20 +16356,20 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB61_1 +; GFX11-NEXT: s_cbranch_execnz .LBB57_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB57_1: ; 
%atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17060,12 +16396,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB61_1 +; GFX10-NEXT: s_cbranch_execnz .LBB57_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -17074,7 +16410,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17101,12 +16437,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -17115,7 +16451,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17140,12 +16476,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB61_1 +; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -17154,7 +16490,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17182,12 +16518,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB61_1 +; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; 
GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -17203,7 +16539,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -17225,12 +16561,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB61_1 +; GFX7-NEXT: s_cbranch_execnz .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -17246,7 +16582,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -17269,17 +16605,15 @@ define void 
@global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB61_1 +; GFX6-NEXT: s_cbranch_execnz .LBB57_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst ret void } attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } - -!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll index 0612383c3f90b..1f0ae39082865 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll @@ -6,25 +6,25 @@ ; FIXME: This will still fail for gfx6/7 and gfx10 subtargets. 
; DISASSEMBLY-VI: .long 0xdd348000 // {{[0-9A-Z]+}}: DD348000 -; DISASSEMBLY-VI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc // {{[0-9A-Z]+}}: 00000100 +; DISASSEMBLY-VI-NEXT: v_cndmask_b32_e32 v1, v0, v0, vcc // {{[0-9A-Z]+}}: 00020100 define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #0 { ; GCN-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[2:3], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GCN-NEXT: s_mov_b64 s[0:1], exec +; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execz .LBB0_2 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GCN-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GCN-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GCN-NEXT: global_atomic_add_f32 v0, v1, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: .LBB0_2: diff --git a/llvm/test/CodeGen/AMDGPU/global-constant.ll b/llvm/test/CodeGen/AMDGPU/global-constant.ll index 38b9c5df7faa1..c790187f9d108 100644 --- a/llvm/test/CodeGen/AMDGPU/global-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/global-constant.ll @@ -49,8 +49,8 @@ define amdgpu_kernel void @private_test(i32 %index, ptr addrspace(1) %out) { ; R600-LABEL: available_externally_test -; GCN-PAL: s_mov_b32 s3, available_externally@abs32@hi -; GCN-PAL: s_mov_b32 s2, available_externally@abs32@lo +; GCN-PAL: s_mov_b32 s1, available_externally@abs32@hi +; GCN-PAL: s_mov_b32 s0, available_externally@abs32@lo define amdgpu_kernel void @available_externally_test(ptr 
addrspace(1) %out) { %ptr = getelementptr [256 x i32], ptr addrspace(4) @available_externally, i32 0, i32 1 %val = load i32, ptr addrspace(4) %ptr diff --git a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll index b8ecbae3d3114..7f6a3ad5c9346 100644 --- a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll +++ b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 { ; GFX908-LABEL: half8: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX908-NEXT: v_mov_b32_e32 v4, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] @@ -18,7 +18,7 @@ define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX90A-LABEL: half8: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] @@ -28,7 +28,7 @@ define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX1030-LABEL: half8: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] @@ -74,7 +74,7 @@ define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr add define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 { ; GFX908-LABEL: half6: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX908-NEXT: v_mov_b32_e32 v3, 
0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] @@ -84,7 +84,7 @@ define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX90A-LABEL: half6: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] @@ -94,7 +94,7 @@ define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX1030-LABEL: half6: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] @@ -132,7 +132,7 @@ define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr add define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 { ; GFX908-LABEL: half4: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX908-NEXT: v_mov_b32_e32 v2, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -144,7 +144,7 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX90A-LABEL: half4: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -155,7 +155,7 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX1030-LABEL: half4: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -188,7 +188,7 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add define amdgpu_kernel void @half2(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 { ; GFX908-LABEL: half2: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: global_load_dword v1, v0, s[0:1] @@ -198,7 +198,7 @@ define amdgpu_kernel void @half2(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX90A-LABEL: half2: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v1, v0, s[0:1] @@ -208,7 +208,7 @@ define amdgpu_kernel void @half2(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX1030-LABEL: half2: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll index f709eae990bda..e54cd64798a68 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll @@ -13,7 +13,7 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: test_move_load_address_to_vgpr: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dword v0, v1, 
s[0:1] glc @@ -54,7 +54,7 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @test_move_load_address_to_vgpr_d16_hi(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: test_move_load_address_to_vgpr_d16_hi: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_ushort v0, v1, s[0:1] glc diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll index dac3a3db7b450..9bee539b1e4e5 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -19,8 +19,8 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_add_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -32,12 +32,12 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_add_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; 
GFX9-NEXT: global_atomic_add v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -50,8 +50,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32_max_neg_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xfffff000 @@ -65,14 +65,14 @@ define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_add_i32_max_neg_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s2, 0xfffff000 -; VI-NEXT: s_addc_u32 s1, s3, -1 +; VI-NEXT: s_add_u32 s0, s0, 0xfffff000 +; VI-NEXT: s_addc_u32 s1, s1, -1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_add v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -80,12 +80,12 @@ define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_add_i32_max_neg_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_add v0, v1, s[2:3] offset:-4096 +; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] offset:-4096 ; 
GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -98,8 +98,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32_soffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -112,8 +112,8 @@ define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_add_i32_soffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -126,12 +126,12 @@ define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_add_i32_soffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x8000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_add v0, v1, s[2:3] offset:3232 +; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] offset:3232 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -144,29 +144,29 @@ entry: define amdgpu_kernel void @atomic_add_i32_huge_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32_huge_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 
+; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xdeac ; SI-NEXT: v_mov_b32_e32 v1, 0xabcd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s0 -; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_add_i32_huge_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s2, 0xdeac -; VI-NEXT: s_addc_u32 s1, s3, 0xabcd +; VI-NEXT: s_add_u32 s0, s0, 0xdeac +; VI-NEXT: s_addc_u32 s1, s1, 0xabcd ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_add v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -174,12 +174,12 @@ define amdgpu_kernel void @atomic_add_i32_huge_offset(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_add_i32_huge_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s2, 0xdeac -; GFX9-NEXT: s_addc_u32 s1, s3, 0xabcd +; GFX9-NEXT: s_add_u32 s0, s0, 0xdeac +; GFX9-NEXT: s_addc_u32 s1, s1, 0xabcd ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -195,8 +195,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_add_i32_ret_offset: ; 
SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -213,29 +213,29 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_add_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_add_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_add v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -251,9 +251,9 @@ entry: define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; 
SI-LABEL: atomic_add_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -268,18 +268,18 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_add_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_add v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -287,12 +287,12 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_add_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, 
s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -310,9 +310,9 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_add_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -332,22 +332,22 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_add_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -356,12 +356,12 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_add_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -381,8 +381,8 @@ entry: define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -394,8 +394,8 @@ define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_add_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -407,12 +407,12 @@ define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_add_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_add v0, v1, s[2:3] +; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -424,8 +424,8 @@ entry: define amdgpu_kernel 
void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_add_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -442,8 +442,8 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_add_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -460,11 +460,11 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_add_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_add v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -479,9 +479,9 @@ entry: define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_add_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 
2 @@ -496,16 +496,16 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_add_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_add v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -513,12 +513,12 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_add_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -535,9 +535,9 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_add_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 
0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -557,20 +557,20 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_add_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -579,12 +579,12 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_add_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -603,8 +603,8 @@ entry: define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: 
atomic_and_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -616,8 +616,8 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_and_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -629,12 +629,12 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_and_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_and v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_atomic_and v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -647,8 +647,8 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_and_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -665,29 +665,29 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: 
atomic_and_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_and_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_and v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -703,9 +703,9 @@ entry: define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_and_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: 
s_lshl_b64 s[4:5], s[4:5], 2 @@ -720,18 +720,18 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_and_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_and v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -739,12 +739,12 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_and_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -762,9 +762,9 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_and_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: 
s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -784,22 +784,22 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_and_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -808,12 +808,12 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_and_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; 
GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -833,8 +833,8 @@ entry: define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_and_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -846,8 +846,8 @@ define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_and_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -859,12 +859,12 @@ define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_and_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_and v0, v1, s[2:3] +; GFX9-NEXT: global_atomic_and v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -876,8 +876,8 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_and_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ 
-894,8 +894,8 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_and_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -912,11 +912,11 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_and_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_and v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -931,9 +931,9 @@ entry: define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_and_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -948,16 +948,16 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_and_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; 
VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_and v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -965,12 +965,12 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_and_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -987,9 +987,9 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_and_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -1009,20 +1009,20 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_and_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 
-; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -1031,12 +1031,12 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_and_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -1055,8 +1055,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_sub_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1068,8 +1068,8 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 
%in) ; ; VI-LABEL: atomic_sub_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1081,12 +1081,12 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_sub_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_sub v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_atomic_sub v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -1099,8 +1099,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_sub_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1117,29 +1117,29 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_sub_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 
s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_sub_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_sub v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -1155,9 +1155,9 @@ entry: define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_sub_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -1172,18 +1172,18 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_sub_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 
s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_sub v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -1191,12 +1191,12 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_sub_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -1214,9 +1214,9 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_sub_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -1236,22 +1236,22 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_sub_i32_ret_addr64_offset: 
; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -1260,12 +1260,12 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_sub_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -1285,8 +1285,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_sub_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: 
s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1298,8 +1298,8 @@ define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_sub_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1311,12 +1311,12 @@ define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_sub_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_sub v0, v1, s[2:3] +; GFX9-NEXT: global_atomic_sub v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -1328,8 +1328,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_sub_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1346,8 +1346,8 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_sub_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1364,11 +1364,11 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_sub_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_sub v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -1383,9 +1383,9 @@ entry: define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_sub_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -1400,16 +1400,16 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_sub_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_sub v[0:1], v2 ; VI-NEXT: 
s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -1417,12 +1417,12 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_sub_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -1439,9 +1439,9 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_sub_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -1461,20 +1461,20 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_sub_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; 
VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -1483,12 +1483,12 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_sub_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -1507,8 +1507,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_max_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1520,8 +1520,8 @@ define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_max_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1533,12 +1533,12 @@ define amdgpu_kernel void 
@atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_max_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_smax v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_atomic_smax v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -1551,8 +1551,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_max_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1568,28 +1568,28 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_max_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_atomic_smax v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_smax v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -1604,9 +1604,9 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_max_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -1619,29 +1619,29 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_max_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 
v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_smax v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -1657,9 +1657,9 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_max_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -1678,20 +1678,20 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_max_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 
16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_smax v0, v[0:1], v2 glc -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -1701,12 +1701,12 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_max_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -1725,8 +1725,8 @@ entry: define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_max_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1736,8 +1736,8 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_max_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1747,12 +1747,12 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 
%in) { ; ; GFX9-LABEL: atomic_max_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_smax v0, v1, s[2:3] +; GFX9-NEXT: global_atomic_smax v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -1762,8 +1762,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_max_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1779,8 +1779,8 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_max_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1796,11 +1796,11 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_max_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_smax v1, v0, v1, 
s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -1814,9 +1814,9 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_max_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -1829,27 +1829,27 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_max_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_smax v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; 
GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -1864,9 +1864,9 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_max_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -1885,18 +1885,18 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_max_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_smax v0, v[0:1], v2 glc -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -1906,12 +1906,12 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_max_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -1929,8 +1929,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_umax_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1940,8 +1940,8 @@ define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_umax_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1951,12 +1951,12 @@ define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_umax_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_umax v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_atomic_umax v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -1967,8 +1967,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; 
SI-LABEL: atomic_umax_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1984,28 +1984,28 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umax_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_atomic_umax v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_umax v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -2020,9 +2020,9 @@ entry: define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) 
%out, i32 %in, i64 %index) { ; SI-LABEL: atomic_umax_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -2035,29 +2035,29 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_umax_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_umax v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -2073,9 +2073,9 @@ entry: 
define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_umax_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -2094,20 +2094,20 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umax_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_umax v0, v[0:1], v2 glc -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -2117,12 +2117,12 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -2141,8 +2141,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_umax_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2152,8 +2152,8 @@ define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_umax_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2163,12 +2163,12 @@ define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_umax_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_umax v0, v1, s[2:3] +; GFX9-NEXT: global_atomic_umax v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -2178,8 +2178,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: 
atomic_umax_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2195,8 +2195,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: atomic_umax_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2212,11 +2212,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: atomic_umax_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_umax v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -2230,9 +2230,9 @@ entry: define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_umax_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -2245,27 +2245,27 @@ define amdgpu_kernel void 
@atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_umax_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_umax v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -2280,9 +2280,9 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_umax_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -2301,18 +2301,18 @@ define amdgpu_kernel void 
@atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umax_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_umax v0, v[0:1], v2 glc -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -2322,12 +2322,12 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -2345,8 +2345,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_min_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 
0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2356,8 +2356,8 @@ define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_min_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2367,12 +2367,12 @@ define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_min_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_smin v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_atomic_smin v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -2383,8 +2383,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_min_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2400,28 +2400,28 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_min_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_atomic_smin v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_smin v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -2436,9 +2436,9 @@ entry: define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_min_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -2451,29 +2451,29 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_min_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_smin v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -2489,9 +2489,9 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_min_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -2510,20 +2510,20 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: 
atomic_min_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_smin v0, v[0:1], v2 glc -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -2533,12 +2533,12 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_min_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -2557,8 +2557,8 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_min_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; 
SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2568,8 +2568,8 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_min_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2579,12 +2579,12 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_min_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_smin v0, v1, s[2:3] +; GFX9-NEXT: global_atomic_smin v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -2594,8 +2594,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_min_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2611,8 +2611,8 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_min_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 
s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2628,11 +2628,11 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_min_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_smin v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -2646,9 +2646,9 @@ entry: define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_min_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -2661,27 +2661,27 @@ define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_min_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: 
flat_atomic_smin v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -2696,9 +2696,9 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_min_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -2717,18 +2717,18 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_min_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: 
v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_smin v0, v[0:1], v2 glc -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -2738,12 +2738,12 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_min_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -2761,8 +2761,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_umin_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2772,8 +2772,8 @@ define amdgpu_kernel void @atomic_umin_i32_offset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_umin_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2783,12 +2783,12 @@ define amdgpu_kernel void @atomic_umin_i32_offset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_umin_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; 
GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_umin v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_atomic_umin v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -2799,8 +2799,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_umin_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2816,28 +2816,28 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umin_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_atomic_umin v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umin_i32_ret_offset: ; GFX9: ; %bb.0: 
; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_umin v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -2852,9 +2852,9 @@ entry: define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_umin_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -2867,29 +2867,29 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_umin_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_umin v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umin_i32_addr64_offset: ; GFX9: ; %bb.0: ; 
%entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -2905,9 +2905,9 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_umin_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -2926,20 +2926,20 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umin_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 
v2, s2 ; VI-NEXT: flat_atomic_umin v0, v[0:1], v2 glc -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -2949,12 +2949,12 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umin_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -2973,8 +2973,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_umin_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2984,8 +2984,8 @@ define amdgpu_kernel void @atomic_umin_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_umin_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2995,12 +2995,12 @@ define amdgpu_kernel void @atomic_umin_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_umin_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 
0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_umin v0, v1, s[2:3] +; GFX9-NEXT: global_atomic_umin v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile umin ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -3010,8 +3010,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_umin_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3027,8 +3027,8 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: atomic_umin_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3044,11 +3044,11 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: atomic_umin_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_umin v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -3062,9 +3062,9 @@ entry: define amdgpu_kernel void 
@atomic_umin_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_umin_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -3077,27 +3077,27 @@ define amdgpu_kernel void @atomic_umin_i32_addr64(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_umin_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_umin v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umin_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -3112,9 +3112,9 @@ entry: define amdgpu_kernel void 
@atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_umin_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -3133,18 +3133,18 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umin_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_umin v0, v[0:1], v2 glc -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -3154,12 +3154,12 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umin_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -3177,8 +3177,8 @@ entry: define amdgpu_kernel void @atomic_or_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_or_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3190,8 +3190,8 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_or_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3203,12 +3203,12 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_or_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_or v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_atomic_or v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -3221,8 +3221,8 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_or_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword 
s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3239,29 +3239,29 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: atomic_or_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_or_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_or v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -3277,9 +3277,9 @@ entry: define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_or_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -3294,18 +3294,18 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i3 ; ; VI-LABEL: atomic_or_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_or v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -3313,12 +3313,12 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i3 ; ; GFX9-LABEL: atomic_or_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -3336,9 +3336,9 @@ 
entry: define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_or_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -3358,22 +3358,22 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out ; ; VI-LABEL: atomic_or_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -3382,12 +3382,12 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out ; ; GFX9-LABEL: atomic_or_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -3407,8 +3407,8 @@ entry: define amdgpu_kernel void @atomic_or_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_or_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3420,8 +3420,8 @@ define amdgpu_kernel void @atomic_or_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_or_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3433,12 +3433,12 @@ define amdgpu_kernel void @atomic_or_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_or_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_or v0, v1, s[2:3] +; GFX9-NEXT: global_atomic_or v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -3450,8 +3450,8 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: 
atomic_or_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3468,8 +3468,8 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: atomic_or_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3486,11 +3486,11 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: atomic_or_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_or v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -3505,9 +3505,9 @@ entry: define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_or_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -3522,16 +3522,16 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) 
%out, i32 %in, ; ; VI-LABEL: atomic_or_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_or v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -3539,12 +3539,12 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_or_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -3561,9 +3561,9 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_or_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: 
s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -3583,20 +3583,20 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: atomic_or_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -3605,12 +3605,12 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr a ; ; GFX9-LABEL: atomic_or_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -3629,8 +3629,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_xchg_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; 
SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3642,8 +3642,8 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_xchg_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3655,12 +3655,12 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_xchg_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -3673,8 +3673,8 @@ entry: define amdgpu_kernel void @atomic_xchg_f32_offset(ptr addrspace(1) %out, float %in) { ; SI-LABEL: atomic_xchg_f32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3686,8 +3686,8 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr addrspace(1) %out, float % ; ; VI-LABEL: atomic_xchg_f32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3699,12 +3699,12 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr addrspace(1) %out, float % ; ; GFX9-LABEL: atomic_xchg_f32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -3717,8 +3717,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_xchg_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3735,29 +3735,29 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xchg_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; 
VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_xchg_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_swap v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -3773,9 +3773,9 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_xchg_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -3790,18 +3790,18 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_xchg_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 
s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_swap v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -3809,12 +3809,12 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_xchg_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -3832,9 +3832,9 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_xchg_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -3854,22 +3854,22 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_xchg_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -3878,12 +3878,12 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_xchg_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -3903,8 +3903,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_xchg_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3916,8 +3916,8 @@ define 
amdgpu_kernel void @atomic_xchg_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_xchg_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3929,12 +3929,12 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_xchg_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -3946,8 +3946,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_xchg_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3964,8 +3964,8 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: atomic_xchg_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3982,11 +3982,11 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(ptr 
addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: atomic_xchg_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4001,9 +4001,9 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_xchg_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -4018,16 +4018,16 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_xchg_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_swap v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4035,12 +4035,12 @@ define amdgpu_kernel void 
@atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_xchg_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -4057,9 +4057,9 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_xchg_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -4079,20 +4079,20 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xchg_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 
+; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -4101,12 +4101,12 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xchg_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -4125,7 +4125,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 %in, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4140,7 +4140,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 ; ; VI-LABEL: atomic_cmpxchg_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4155,7 +4155,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_cmpxchg_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4173,8 +4173,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4192,31 +4192,31 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_cmpxchg_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_cmpxchg_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4233,10 +4233,10 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s7, s[0:1], 0xf -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s7, s[2:3], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -4252,19 +4252,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_cmpxchg_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s7, s[0:1], 0x3c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dword s6, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x3c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap 
v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -4273,17 +4273,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_cmpxchg_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s7, s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x3c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4298,10 +4298,10 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s2, s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s10, s[0:1], 0x11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s10, s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0x11 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -4309,8 +4309,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1) ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: 
v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], v[2:3], s[4:7], 0 addr64 offset:16 glc @@ -4322,24 +4322,24 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1) ; ; VI-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s9, s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x44 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -4348,13 +4348,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1) ; ; GFX9-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 -; GFX9-NEXT: s_load_dword s9, s[0:1], 0x44 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dword s9, s[2:3], 
0x44 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 @@ -4376,7 +4376,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4391,7 +4391,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3 ; ; VI-LABEL: atomic_cmpxchg_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4406,7 +4406,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3 ; ; GFX9-LABEL: atomic_cmpxchg_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4423,8 +4423,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4442,8 +4442,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: atomic_cmpxchg_i32_ret: ; VI: ; %bb.0: ; %entry -; 
VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4461,12 +4461,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: atomic_cmpxchg_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4482,10 +4482,10 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s7, s[0:1], 0xf -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s7, s[2:3], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -4501,17 +4501,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32 ; ; VI-LABEL: atomic_cmpxchg_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s7, s[0:1], 0x3c +; 
VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dword s6, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x3c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -4520,17 +4520,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_cmpxchg_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s7, s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x3c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4544,10 +4544,10 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s2, s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s10, s[0:1], 0x11 
+; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s10, s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0x11 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -4555,8 +4555,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out, ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], v[2:3], s[4:7], 0 addr64 glc @@ -4568,22 +4568,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_cmpxchg_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s9, s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x44 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -4592,13 +4592,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out, ; ; GFX9-LABEL: 
atomic_cmpxchg_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 -; GFX9-NEXT: s_load_dword s9, s[0:1], 0x44 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dword s9, s[2:3], 0x44 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 @@ -4619,8 +4619,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_xor_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4632,8 +4632,8 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_xor_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4645,12 +4645,12 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_xor_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_xor v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_atomic_xor v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -4663,8 +4663,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_xor_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4681,29 +4681,29 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xor_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_xor_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_xor v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4719,9 +4719,9 @@ entry: define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_xor_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -4736,18 +4736,18 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_xor_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_xor v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4755,12 +4755,12 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_xor_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; 
GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -4778,9 +4778,9 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_xor_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -4800,22 +4800,22 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_xor_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc ; VI-NEXT: 
s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -4824,12 +4824,12 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_xor_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -4849,8 +4849,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_xor_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4862,8 +4862,8 @@ define amdgpu_kernel void @atomic_xor_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_xor_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4875,12 +4875,12 @@ define amdgpu_kernel void @atomic_xor_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_xor_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; 
GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_xor v0, v1, s[2:3] +; GFX9-NEXT: global_atomic_xor v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -4892,8 +4892,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_xor_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4910,8 +4910,8 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_xor_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4928,11 +4928,11 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_xor_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_xor v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4947,9 +4947,9 @@ entry: define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; 
SI-LABEL: atomic_xor_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -4964,16 +4964,16 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_xor_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_xor v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4981,12 +4981,12 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_xor_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ 
-5003,9 +5003,9 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_xor_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -5025,20 +5025,20 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xor_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -5047,12 +5047,12 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xor_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -5071,7 +5071,7 @@ entry: define amdgpu_kernel void @atomic_load_i32_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5087,7 +5087,7 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5105,7 +5105,7 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc @@ -5123,7 +5123,7 @@ entry: define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i32_negoffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 @@ -5141,7 +5141,7 @@ define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr a ; ; VI-LABEL: atomic_load_i32_negoffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5159,7 +5159,7 @@ define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr a ; ; GFX9-LABEL: atomic_load_i32_negoffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:-512 glc @@ -5177,7 +5177,7 @@ entry: define amdgpu_kernel void @atomic_load_f32_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_f32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5193,7 +5193,7 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_f32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5211,7 +5211,7 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_f32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc @@ -5229,7 +5229,7 @@ entry: define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: 
s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5245,7 +5245,7 @@ define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1 ; ; VI-LABEL: atomic_load_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5261,7 +5261,7 @@ define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1 ; ; GFX9-LABEL: atomic_load_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -5278,8 +5278,8 @@ entry: define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_load_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -5298,8 +5298,8 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr addrspace(1) %in, p ; ; VI-LABEL: atomic_load_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5320,11 +5320,11 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr addrspace(1) %in, p ; ; GFX9-LABEL: atomic_load_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc @@ -5343,8 +5343,8 @@ entry: define amdgpu_kernel void @atomic_load_i32_addr64(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_load_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -5363,8 +5363,8 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5383,11 +5383,11 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -5405,8 +5405,8 @@ entry: define 
amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_load_f32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -5425,8 +5425,8 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr addrspace(1) %in, p ; ; VI-LABEL: atomic_load_f32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5447,11 +5447,11 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr addrspace(1) %in, p ; ; GFX9-LABEL: atomic_load_f32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc @@ -5470,8 +5470,8 @@ entry: define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: 
s_waitcnt lgkmcnt(0) @@ -5481,25 +5481,25 @@ define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr addrspace(1) %ou ; ; VI-LABEL: atomic_store_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s4, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s2, 16 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -5510,8 +5510,8 @@ entry: define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5521,23 +5521,23 @@ define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: store atomic i32 %in, ptr addrspace(1) %out seq_cst, align 4 @@ -5547,8 +5547,8 @@ entry: define amdgpu_kernel void @atomic_store_f32(float %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5558,23 +5558,23 @@ define amdgpu_kernel void @atomic_store_f32(float %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_f32: ; GFX9: ; %bb.0: ; %entry -; 
GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: store atomic float %in, ptr addrspace(1) %out seq_cst, align 4 @@ -5584,8 +5584,8 @@ entry: define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_store_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dword s2, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -5598,8 +5598,8 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr addrspace ; ; VI-LABEL: atomic_store_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; VI-NEXT: s_load_dword s2, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 @@ -5614,14 +5614,14 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr addrspace ; ; GFX9-LABEL: atomic_store_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: @@ -5634,8 +5634,8 @@ entry: define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_store_f32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dword s2, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -5648,8 +5648,8 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr addrspa ; ; VI-LABEL: atomic_store_f32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; VI-NEXT: s_load_dword s2, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 @@ -5664,14 +5664,14 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr addrspa ; ; GFX9-LABEL: atomic_store_f32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: @@ -5684,8 +5684,8 @@ entry: define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_store_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; 
SI-NEXT: s_load_dword s8, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dword s8, s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 @@ -5699,8 +5699,8 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr addrspace(1) %ou ; ; VI-LABEL: atomic_store_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; VI-NEXT: s_load_dword s2, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 @@ -5713,14 +5713,14 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_store_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: @@ -5732,8 +5732,8 @@ entry: define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_store_f32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dword s8, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dword s8, s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 @@ -5747,8 +5747,8 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr addrspace(1) % ; ; VI-LABEL: atomic_store_f32_addr64: ; VI: ; %bb.0: ; %entry 
-; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; VI-NEXT: s_load_dword s2, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 @@ -5761,14 +5761,14 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr addrspace(1) % ; ; GFX9-LABEL: atomic_store_f32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: @@ -5780,7 +5780,7 @@ entry: define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i8_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5796,7 +5796,7 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrs ; ; VI-LABEL: atomic_load_i8_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5812,7 +5812,7 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrs ; ; GFX9-LABEL: atomic_load_i8_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:16 glc @@ -5830,7 +5830,7 @@ entry: define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i8_negoffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 @@ -5848,7 +5848,7 @@ define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr ad ; ; VI-LABEL: atomic_load_i8_negoffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5866,7 +5866,7 @@ define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr ad ; ; GFX9-LABEL: atomic_load_i8_negoffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:-512 glc @@ -5884,8 +5884,8 @@ entry: define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i8_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5895,25 +5895,25 @@ define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr addrspace(1) %out) ; ; VI-LABEL: atomic_store_i8_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s4, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, 
s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s2, 16 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i8_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_byte v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_store_byte v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(1) %out, i64 16 @@ -5924,8 +5924,8 @@ entry: define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5935,23 +5935,23 @@ define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i8: 
; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: store atomic i8 %in, ptr addrspace(1) %out seq_cst, align 1 @@ -5961,7 +5961,7 @@ entry: define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i16_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5977,7 +5977,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_i16_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5993,7 +5993,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i16_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc @@ -6011,7 +6011,7 @@ entry: define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i16_negoffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 
@@ -6029,7 +6029,7 @@ define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr a ; ; VI-LABEL: atomic_load_i16_negoffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6047,7 +6047,7 @@ define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr a ; ; GFX9-LABEL: atomic_load_i16_negoffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc @@ -6065,8 +6065,8 @@ entry: define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i16_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6076,25 +6076,25 @@ define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr addrspace(1) %ou ; ; VI-LABEL: atomic_store_i16_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s4, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s2, 16 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i16_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, 
s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i16, ptr addrspace(1) %out, i64 8 @@ -6105,8 +6105,8 @@ entry: define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6116,23 +6116,23 @@ define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: store 
atomic i16 %in, ptr addrspace(1) %out seq_cst, align 2 @@ -6142,8 +6142,8 @@ entry: define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_f16_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6153,25 +6153,25 @@ define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr addrspace(1) %o ; ; VI-LABEL: atomic_store_f16_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s4, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s2, 16 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_f16_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr half, ptr addrspace(1) %out, i64 8 @@ -6182,8 +6182,8 @@ entry: define amdgpu_kernel void @atomic_store_f16(half %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0xb +; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6193,23 +6193,23 @@ define amdgpu_kernel void @atomic_store_f16(half %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: store atomic half %in, ptr addrspace(1) %out seq_cst, align 2 @@ -6219,8 +6219,8 @@ entry: define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_bf16_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6230,25 +6230,25 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr addrspace(1) ; ; VI-LABEL: atomic_store_bf16_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 
0x2c -; VI-NEXT: s_load_dword s4, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s2, 16 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_bf16_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm %gep = getelementptr bfloat, ptr addrspace(1) %out, i64 8 store atomic bfloat %in, ptr addrspace(1) %gep seq_cst, align 2 @@ -6258,8 +6258,8 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr addrspace(1) define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_bf16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6269,23 +6269,23 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr addrspace(1) %out) ; ; VI-LABEL: atomic_store_bf16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; 
VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm store atomic bfloat %in, ptr addrspace(1) %out seq_cst, align 2 ret void @@ -6294,8 +6294,8 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr addrspace(1) %out) define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_inc_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6307,8 +6307,8 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_inc_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6320,12 +6320,12 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_inc_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, 
s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_inc v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6338,8 +6338,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_inc_i32_max_neg_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xfffff000 @@ -6353,14 +6353,14 @@ define amdgpu_kernel void @atomic_inc_i32_max_neg_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_inc_i32_max_neg_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s2, 0xfffff000 -; VI-NEXT: s_addc_u32 s1, s3, -1 +; VI-NEXT: s_add_u32 s0, s0, 0xfffff000 +; VI-NEXT: s_addc_u32 s1, s1, -1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_inc v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6368,12 +6368,12 @@ define amdgpu_kernel void @atomic_inc_i32_max_neg_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_inc_i32_max_neg_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_inc v0, v1, s[2:3] offset:-4096 +; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] offset:-4096 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6386,8 +6386,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_soffset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_inc_i32_soffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -6400,8 +6400,8 @@ define amdgpu_kernel void @atomic_inc_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_inc_i32_soffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -6414,12 +6414,12 @@ define amdgpu_kernel void @atomic_inc_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_inc_i32_soffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x8000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_inc v0, v1, s[2:3] offset:3232 +; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] offset:3232 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6432,29 +6432,29 @@ entry: define amdgpu_kernel void @atomic_inc_i32_huge_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_inc_i32_huge_offset: ; SI: ; %bb.0: ; %entry -; 
SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xdeac ; SI-NEXT: v_mov_b32_e32 v1, 0xabcd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s0 -; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_inc_i32_huge_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s2, 0xdeac -; VI-NEXT: s_addc_u32 s1, s3, 0xabcd +; VI-NEXT: s_add_u32 s0, s0, 0xdeac +; VI-NEXT: s_addc_u32 s1, s1, 0xabcd ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_inc v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6462,12 +6462,12 @@ define amdgpu_kernel void @atomic_inc_i32_huge_offset(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_inc_i32_huge_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s2, 0xdeac -; GFX9-NEXT: s_addc_u32 s1, s3, 0xabcd +; GFX9-NEXT: s_add_u32 s0, s0, 0xdeac +; GFX9-NEXT: s_addc_u32 s1, s1, 0xabcd ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] ; GFX9-NEXT: 
s_waitcnt vmcnt(0) @@ -6482,8 +6482,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_inc_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6500,29 +6500,29 @@ define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_inc_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_inc_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_inc v1, v0, v1, s[4:5] offset:16 glc ; 
GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6538,9 +6538,9 @@ entry: define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_inc_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -6555,18 +6555,18 @@ define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_inc_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_inc v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6574,12 +6574,12 @@ define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_inc_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: 
s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -6597,9 +6597,9 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_inc_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -6619,22 +6619,22 @@ define amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_inc_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -6643,12 +6643,12 @@ define 
amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_inc_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -6668,8 +6668,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_dec_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6681,8 +6681,8 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_dec_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6694,12 +6694,12 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_dec_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_dec v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6712,8 +6712,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_dec_i32_max_neg_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xfffff000 @@ -6727,14 +6727,14 @@ define amdgpu_kernel void @atomic_dec_i32_max_neg_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_dec_i32_max_neg_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s2, 0xfffff000 -; VI-NEXT: s_addc_u32 s1, s3, -1 +; VI-NEXT: s_add_u32 s0, s0, 0xfffff000 +; VI-NEXT: s_addc_u32 s1, s1, -1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_dec v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6742,12 +6742,12 @@ define amdgpu_kernel void @atomic_dec_i32_max_neg_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_dec_i32_max_neg_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_dec v0, v1, s[2:3] offset:-4096 +; GFX9-NEXT: 
global_atomic_dec v0, v1, s[0:1] offset:-4096 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6760,8 +6760,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_soffset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_dec_i32_soffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -6774,8 +6774,8 @@ define amdgpu_kernel void @atomic_dec_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_dec_i32_soffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -6788,12 +6788,12 @@ define amdgpu_kernel void @atomic_dec_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_dec_i32_soffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x8000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_dec v0, v1, s[2:3] offset:3232 +; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] offset:3232 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6806,29 +6806,29 @@ entry: define amdgpu_kernel void @atomic_dec_i32_huge_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_dec_i32_huge_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: 
s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xdeac ; SI-NEXT: v_mov_b32_e32 v1, 0xabcd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s0 -; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_dec_i32_huge_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s2, 0xdeac -; VI-NEXT: s_addc_u32 s1, s3, 0xabcd +; VI-NEXT: s_add_u32 s0, s0, 0xdeac +; VI-NEXT: s_addc_u32 s1, s1, 0xabcd ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_dec v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6836,12 +6836,12 @@ define amdgpu_kernel void @atomic_dec_i32_huge_offset(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_dec_i32_huge_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s2, 0xdeac -; GFX9-NEXT: s_addc_u32 s1, s3, 0xabcd +; GFX9-NEXT: s_add_u32 s0, s0, 0xdeac +; GFX9-NEXT: s_addc_u32 s1, s1, 0xabcd ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -6856,8 +6856,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr addrspace(1) %out, ptr 
addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_dec_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6874,29 +6874,29 @@ define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_dec_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_dec_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_atomic_dec v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6912,9 +6912,9 @@ entry: define amdgpu_kernel void 
@atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_dec_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dword s6, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -6929,18 +6929,18 @@ define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_dec_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_dec v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6948,12 +6948,12 @@ define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_dec_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 
s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -6971,9 +6971,9 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_dec_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf +; SI-NEXT: s_load_dword s2, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -6993,22 +6993,22 @@ define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_dec_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -7017,12 +7017,12 @@ define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_dec_i32_ret_addr64_offset: ; GFX9: 
; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -7042,7 +7042,7 @@ entry: define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_f16_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -7058,7 +7058,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_f16_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7074,7 +7074,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_f16_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc @@ -7091,7 +7091,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_f16_negoffset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 @@ -7109,7 +7109,7 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a ; ; VI-LABEL: atomic_load_f16_negoffset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7127,7 +7127,7 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a ; ; GFX9-LABEL: atomic_load_f16_negoffset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc @@ -7144,7 +7144,7 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_bf16_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -7160,7 +7160,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add ; ; VI-LABEL: atomic_load_bf16_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7176,7 +7176,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add ; ; GFX9-LABEL: atomic_load_bf16_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, 
s[0:1] offset:16 glc @@ -7193,7 +7193,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_bf16_negoffset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 @@ -7211,7 +7211,7 @@ define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr ; ; VI-LABEL: atomic_load_bf16_negoffset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7229,7 +7229,7 @@ define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr ; ; GFX9-LABEL: atomic_load_bf16_negoffset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index 516c92f1640ea..a7ba8a084272b 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -4616,7 +4616,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) { ; SI-LABEL: atomic_max_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s3, 31 ; SI-NEXT: s_mov_b32 s4, s3 @@ -4648,7 +4648,7 @@ define amdgpu_kernel void 
@atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_max_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 @@ -4679,7 +4679,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_max_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s5, s3, 31 @@ -4714,8 +4714,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_max_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -4753,8 +4753,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_max_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -4789,24 +4789,24 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_max_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; 
GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s1, s3, 31 -; GFX9-NEXT: s_mov_b32 s0, s3 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 +; GFX9-NEXT: s_ashr_i32 s3, s1, 31 +; GFX9-NEXT: s_mov_b32 s2, s1 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX9-NEXT: s_add_u32 s2, s4, s2 +; GFX9-NEXT: s_addc_u32 s3, s5, s3 +; GFX9-NEXT: s_load_dword s1, s[2:3], 0x10 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_max_i32_e32 v2, s2, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] offset:16 glc +; GFX9-NEXT: v_max_i32_e32 v2, s0, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -4829,7 +4829,7 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i32 %index) { ; SI-LABEL: atomic_max_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s3, 31 ; SI-NEXT: s_mov_b32 s4, s3 @@ -4861,7 +4861,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_max_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 @@ -4890,7 +4890,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: 
atomic_max_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s5, s3, 31 @@ -4924,8 +4924,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_max_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -4963,8 +4963,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_max_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -4997,24 +4997,24 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_max_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s1, s3, 31 -; GFX9-NEXT: s_mov_b32 s0, s3 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-NEXT: s_ashr_i32 s3, s1, 31 +; GFX9-NEXT: s_mov_b32 s2, s1 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX9-NEXT: s_add_u32 
s2, s4, s2 +; GFX9-NEXT: s_addc_u32 s3, s5, s3 +; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: .LBB94_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_max_i32_e32 v2, s2, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc +; GFX9-NEXT: v_max_i32_e32 v2, s0, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -5869,7 +5869,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) { ; SI-LABEL: atomic_umax_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s3, 31 ; SI-NEXT: s_mov_b32 s4, s3 @@ -5901,7 +5901,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_umax_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 @@ -5932,7 +5932,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_umax_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s5, s3, 31 @@ -5967,8 +5967,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 
%in, i32 %index) { ; SI-LABEL: atomic_umax_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -6006,8 +6006,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umax_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -6042,24 +6042,24 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s1, s3, 31 -; GFX9-NEXT: s_mov_b32 s0, s3 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 +; GFX9-NEXT: s_ashr_i32 s3, s1, 31 +; GFX9-NEXT: s_mov_b32 s2, s1 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX9-NEXT: s_add_u32 s2, s4, s2 +; GFX9-NEXT: s_addc_u32 s3, s5, s3 +; GFX9-NEXT: s_load_dword s1, s[2:3], 0x10 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; 
GFX9-NEXT: v_max_u32_e32 v2, s2, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] offset:16 glc +; GFX9-NEXT: v_max_u32_e32 v2, s0, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -6082,8 +6082,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_umax_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -6121,8 +6121,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umax_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -6155,24 +6155,24 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s1, s3, 31 -; GFX9-NEXT: s_mov_b32 s0, s3 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-NEXT: s_ashr_i32 s3, s1, 31 +; GFX9-NEXT: 
s_mov_b32 s2, s1 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX9-NEXT: s_add_u32 s2, s4, s2 +; GFX9-NEXT: s_addc_u32 s3, s5, s3 +; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: .LBB107_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_max_u32_e32 v2, s2, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc +; GFX9-NEXT: v_max_u32_e32 v2, s0, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -7860,7 +7860,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) { ; SI-LABEL: atomic_min_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s3, 31 ; SI-NEXT: s_mov_b32 s4, s3 @@ -7892,7 +7892,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_min_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 @@ -7923,7 +7923,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_min_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s5, s3, 31 @@ -7958,8 +7958,8 @@ entry: define amdgpu_kernel void 
@atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_min_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -7997,8 +7997,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_min_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -8033,24 +8033,24 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_min_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s1, s3, 31 -; GFX9-NEXT: s_mov_b32 s0, s3 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 +; GFX9-NEXT: s_ashr_i32 s3, s1, 31 +; GFX9-NEXT: s_mov_b32 s2, s1 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX9-NEXT: s_add_u32 s2, s4, s2 +; GFX9-NEXT: s_addc_u32 s3, s5, s3 +; GFX9-NEXT: s_load_dword s1, s[2:3], 0x10 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: .LBB129_1: ; %atomicrmw.start ; 
GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_min_i32_e32 v2, s2, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] offset:16 glc +; GFX9-NEXT: v_min_i32_e32 v2, s0, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -8073,36 +8073,36 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_min_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s3, s[4:5], 0x0 -; SI-NEXT: s_mov_b64 s[0:1], 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dword s2, s[0:1], 0x0 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: .LBB130_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_min_i32_e32 v0, s2, v1 +; SI-NEXT: v_min_i32_e32 v0, s6, v1 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; SI-NEXT: s_cbranch_execnz .LBB130_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_min_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s3, s[4:5], 0x0 @@ -8126,24 +8126,24 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_min_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB130_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_min_i32_e32 v0, s4, v1 -; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB130_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -8155,8 +8155,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_min_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) 
; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -8194,8 +8194,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_min_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -8228,24 +8228,24 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_min_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s1, s3, 31 -; GFX9-NEXT: s_mov_b32 s0, s3 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-NEXT: s_ashr_i32 s3, s1, 31 +; GFX9-NEXT: s_mov_b32 s2, s1 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX9-NEXT: s_add_u32 s2, s4, s2 +; GFX9-NEXT: s_addc_u32 s3, s5, s3 +; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: .LBB131_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_min_i32_e32 v2, s2, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc +; GFX9-NEXT: v_min_i32_e32 v2, s0, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 diff --git 
a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll index 68482ca3eaf87..3bf52a56fef5b 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_add_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -20,7 +20,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_add_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -33,7 +33,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_add_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -45,7 +45,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX12-LABEL: atomic_add_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -62,8 +62,8 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_add_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; 
CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -81,8 +81,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_add_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -100,12 +100,12 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_add_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -115,8 +115,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_add_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -137,8 +137,8 @@ entry: define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: 
atomic_add_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -154,8 +154,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_add_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -173,12 +173,12 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_add_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -190,8 +190,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_add_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ 
-212,7 +212,7 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_add_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -233,7 +233,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_add_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -256,7 +256,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_add_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -272,7 +272,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX12-LABEL: atomic_add_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -297,7 +297,7 @@ entry: define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_add_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -312,7 +312,7 @@ define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) { ; ; 
VI-LABEL: atomic_add_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -327,7 +327,7 @@ define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_add_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -339,7 +339,7 @@ define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_add_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -355,8 +355,8 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_add_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -374,8 +374,8 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_add_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -393,12 +393,12 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: 
atomic_add_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -408,8 +408,8 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-LABEL: atomic_add_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -429,8 +429,8 @@ entry: define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_add_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -446,8 +446,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; VI-LABEL: atomic_add_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: 
s_lshl_b64 s[0:1], s[0:1], 3 @@ -463,12 +463,12 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_add_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -480,8 +480,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-LABEL: atomic_add_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -501,7 +501,7 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_add_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -522,7 +522,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_add_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -543,7 +543,7 @@ define amdgpu_kernel void 
@atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_add_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -559,7 +559,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_add_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -583,7 +583,7 @@ entry: define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_and_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -596,7 +596,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_and_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -609,7 +609,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_and_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -621,7 +621,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX12-LABEL: atomic_and_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], 
s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -638,8 +638,8 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_and_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -657,8 +657,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_and_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -676,12 +676,12 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_and_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -691,8 +691,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_and_i64_ret_offset: ; GFX12: ; %bb.0: ; 
%entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -713,8 +713,8 @@ entry: define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_and_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -730,8 +730,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_and_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -749,12 +749,12 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_and_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -766,8 +766,8 @@ define 
amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_and_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -788,7 +788,7 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_and_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -809,7 +809,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_and_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -832,7 +832,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_and_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -848,7 +848,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX12-LABEL: atomic_and_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: 
v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -873,7 +873,7 @@ entry: define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_and_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -888,7 +888,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_and_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -903,7 +903,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_and_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -915,7 +915,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_and_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -931,8 +931,8 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_and_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -950,8 +950,8 @@ define amdgpu_kernel void 
@atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_and_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -969,12 +969,12 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_and_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -984,8 +984,8 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-LABEL: atomic_and_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -1005,8 +1005,8 @@ entry: define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_and_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt 
lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1022,8 +1022,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; VI-LABEL: atomic_and_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1039,12 +1039,12 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_and_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -1056,8 +1056,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-LABEL: atomic_and_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -1077,7 +1077,7 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_and_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; CI-NEXT: 
s_load_dwordx8 s[0:7], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1098,7 +1098,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_and_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -1119,7 +1119,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_and_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -1135,7 +1135,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_and_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -1159,7 +1159,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_sub_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1172,7 +1172,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_sub_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1185,7 +1185,7 @@ 
define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_sub_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1197,7 +1197,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX12-LABEL: atomic_sub_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -1214,8 +1214,8 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_sub_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1233,8 +1233,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_sub_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1252,12 +1252,12 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_sub_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; 
GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -1267,8 +1267,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_sub_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -1289,8 +1289,8 @@ entry: define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_sub_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1306,8 +1306,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_sub_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1325,12 +1325,12 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: 
atomic_sub_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -1342,8 +1342,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_sub_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -1364,7 +1364,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_sub_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1385,7 +1385,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_sub_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -1408,7 +1408,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: 
atomic_sub_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -1424,7 +1424,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -1449,7 +1449,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_sub_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1464,7 +1464,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_sub_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1479,7 +1479,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_sub_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1491,7 +1491,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_sub_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -1507,8 +1507,8 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_sub_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1526,8 +1526,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_sub_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1545,12 +1545,12 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_sub_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -1560,8 +1560,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-LABEL: atomic_sub_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; 
GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -1581,8 +1581,8 @@ entry: define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_sub_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1598,8 +1598,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; VI-LABEL: atomic_sub_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1615,12 +1615,12 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_sub_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -1632,8 +1632,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-LABEL: atomic_sub_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: 
s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -1653,7 +1653,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_sub_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1674,7 +1674,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_sub_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -1695,7 +1695,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_sub_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -1711,7 +1711,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_sub_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -1735,7 +1735,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: 
atomic_max_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1746,7 +1746,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_max_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1757,7 +1757,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_max_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1767,7 +1767,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX12-LABEL: atomic_max_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -1784,8 +1784,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_max_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1802,8 +1802,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_max_i64_ret_offset: ; VI: ; %bb.0: ; 
%entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1820,12 +1820,12 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_max_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] @@ -1834,8 +1834,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_max_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -1856,8 +1856,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_max_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; 
CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1871,8 +1871,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_max_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1888,12 +1888,12 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_max_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -1903,8 +1903,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_max_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -1925,7 +1925,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_max_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx8 
s[0:7], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1945,7 +1945,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_max_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -1967,7 +1967,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -1982,7 +1982,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX12-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -2007,7 +2007,7 @@ entry: define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_max_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2020,7 +2020,7 @@ define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_max_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2033,7 +2033,7 @@ define amdgpu_kernel void 
@atomic_max_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_max_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2043,7 +2043,7 @@ define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_max_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -2059,8 +2059,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_max_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2077,8 +2077,8 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_max_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2095,12 +2095,12 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_max_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 
v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] @@ -2109,8 +2109,8 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-LABEL: atomic_max_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -2130,8 +2130,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_max_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2145,8 +2145,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; VI-LABEL: atomic_max_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2160,12 +2160,12 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_max_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; 
GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -2175,8 +2175,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-LABEL: atomic_max_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2196,7 +2196,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_max_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2216,7 +2216,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_max_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -2236,7 +2236,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_max_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -2251,7 +2251,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_max_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -2275,7 +2275,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_umax_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2286,7 +2286,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in ; ; VI-LABEL: atomic_umax_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2297,7 +2297,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in ; ; GFX9-LABEL: atomic_umax_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2307,7 +2307,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in ; ; GFX12-LABEL: atomic_umax_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -2324,8 +2324,8 @@ entry: 
define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_umax_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2342,8 +2342,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umax_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2360,12 +2360,12 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umax_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] @@ -2374,8 +2374,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_umax_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; 
GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -2396,8 +2396,8 @@ entry: define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_umax_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2411,8 +2411,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_umax_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2428,12 +2428,12 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_umax_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -2443,8 +2443,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; GFX12-LABEL: atomic_umax_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 
-; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2465,7 +2465,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_umax_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2485,7 +2485,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umax_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -2507,7 +2507,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -2522,7 +2522,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -2547,7 +2547,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) { ; 
CI-LABEL: atomic_umax_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2560,7 +2560,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_umax_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2573,7 +2573,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_umax_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2583,7 +2583,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_umax_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -2599,8 +2599,8 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_umax_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2617,8 +2617,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: atomic_umax_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2635,12 +2635,12 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: atomic_umax_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] @@ -2649,8 +2649,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX12-LABEL: atomic_umax_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -2670,8 +2670,8 @@ entry: define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_umax_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2685,8 +2685,8 @@ define amdgpu_kernel 
void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in ; ; VI-LABEL: atomic_umax_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2700,12 +2700,12 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in ; ; GFX9-LABEL: atomic_umax_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -2715,8 +2715,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in ; GFX12-LABEL: atomic_umax_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2736,7 +2736,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_umax_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ 
-2756,7 +2756,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umax_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -2776,7 +2776,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -2791,7 +2791,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_umax_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -2815,7 +2815,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_min_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2826,7 +2826,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_min_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2837,7 +2837,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_min_i64_offset: ; GFX9: ; 
%bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2847,7 +2847,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX12-LABEL: atomic_min_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -2864,8 +2864,8 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_min_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2882,8 +2882,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_min_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2900,12 +2900,12 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_min_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: 
v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] @@ -2914,8 +2914,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_min_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -2936,8 +2936,8 @@ entry: define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_min_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2951,8 +2951,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_min_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2968,12 +2968,12 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_min_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -2983,8 +2983,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_min_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -3005,7 +3005,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_min_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3025,7 +3025,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_min_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -3047,7 +3047,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], 
s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -3062,7 +3062,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX12-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -3087,7 +3087,7 @@ entry: define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_min_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3100,7 +3100,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_min_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3113,7 +3113,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_min_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -3123,7 +3123,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_min_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -3139,8 +3139,8 @@ entry: define amdgpu_kernel void 
@atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_min_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3157,8 +3157,8 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_min_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3175,12 +3175,12 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_min_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] @@ -3189,8 +3189,8 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-LABEL: atomic_min_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -3210,8 +3210,8 @@ entry: define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_min_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3225,8 +3225,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; VI-LABEL: atomic_min_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3240,12 +3240,12 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_min_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -3255,8 +3255,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-LABEL: atomic_min_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: 
s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -3276,7 +3276,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_min_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3296,7 +3296,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_min_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -3316,7 +3316,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_min_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -3331,7 +3331,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_min_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -3355,7 +3355,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_umin_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 
0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -3366,7 +3366,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in ; ; VI-LABEL: atomic_umin_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -3377,7 +3377,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in ; ; GFX9-LABEL: atomic_umin_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -3387,7 +3387,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in ; ; GFX12-LABEL: atomic_umin_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -3404,8 +3404,8 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_umin_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3422,8 +3422,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umin_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3440,12 +3440,12 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umin_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] @@ -3454,8 +3454,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_umin_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -3476,8 +3476,8 @@ entry: define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_umin_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3491,8 +3491,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr 
addrspace(1) %out, ; ; VI-LABEL: atomic_umin_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3508,12 +3508,12 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_umin_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -3523,8 +3523,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out, ; GFX12-LABEL: atomic_umin_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -3545,7 +3545,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_umin_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3565,7 
+3565,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umin_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -3587,7 +3587,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umin_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -3602,7 +3602,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -3627,7 +3627,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_umin_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3640,7 +3640,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_umin_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3653,7 +3653,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_umin_i64: ; GFX9: ; %bb.0: ; %entry -; 
GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -3663,7 +3663,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_umin_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -3679,8 +3679,8 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_umin_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3697,8 +3697,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: atomic_umin_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3715,12 +3715,12 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: atomic_umin_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, 
s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] @@ -3729,8 +3729,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX12-LABEL: atomic_umin_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -3750,8 +3750,8 @@ entry: define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_umin_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3765,8 +3765,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in ; ; VI-LABEL: atomic_umin_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3780,12 +3780,12 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in ; ; GFX9-LABEL: atomic_umin_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; 
GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -3795,8 +3795,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in ; GFX12-LABEL: atomic_umin_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -3816,7 +3816,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_umin_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3836,7 +3836,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umin_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -3856,7 +3856,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umin_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -3871,7 
+3871,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_umin_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -3895,7 +3895,7 @@ entry: define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_or_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -3908,7 +3908,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_or_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -3921,7 +3921,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_or_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -3933,7 +3933,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX12-LABEL: atomic_or_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -3950,8 +3950,8 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; 
CI-LABEL: atomic_or_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3969,8 +3969,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: atomic_or_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3988,12 +3988,12 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a ; ; GFX9-LABEL: atomic_or_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4003,8 +4003,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a ; GFX12-LABEL: atomic_or_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; 
GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -4025,8 +4025,8 @@ entry: define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_or_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4042,8 +4042,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6 ; ; VI-LABEL: atomic_or_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4061,12 +4061,12 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6 ; ; GFX9-LABEL: atomic_or_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -4078,8 +4078,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6 ; GFX12-LABEL: atomic_or_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; 
GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -4100,7 +4100,7 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_or_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4121,7 +4121,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out ; ; VI-LABEL: atomic_or_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -4144,7 +4144,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out ; ; GFX9-LABEL: atomic_or_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -4160,7 +4160,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out ; ; GFX12-LABEL: atomic_or_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -4185,7 +4185,7 @@ entry: define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_or_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: 
s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4200,7 +4200,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_or_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4215,7 +4215,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_or_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4227,7 +4227,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_or_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -4243,8 +4243,8 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_or_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4262,8 +4262,8 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: atomic_or_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 
0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4281,12 +4281,12 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: atomic_or_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4296,8 +4296,8 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac ; GFX12-LABEL: atomic_or_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -4317,8 +4317,8 @@ entry: define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_or_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4334,8 +4334,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; VI-LABEL: atomic_or_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4351,12 +4351,12 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_or_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -4368,8 +4368,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-LABEL: atomic_or_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -4389,7 +4389,7 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_or_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4410,7 +4410,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: atomic_or_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; 
VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -4431,7 +4431,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a ; ; GFX9-LABEL: atomic_or_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -4447,7 +4447,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a ; ; GFX12-LABEL: atomic_or_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -4471,7 +4471,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_xchg_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -4484,7 +4484,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in ; ; VI-LABEL: atomic_xchg_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -4497,7 +4497,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in ; ; GFX9-LABEL: atomic_xchg_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 
@@ -4509,7 +4509,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in ; ; GFX12-LABEL: atomic_xchg_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -4526,7 +4526,7 @@ entry: define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double %in) { ; CI-LABEL: atomic_xchg_f64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -4539,7 +4539,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double ; ; VI-LABEL: atomic_xchg_f64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -4552,7 +4552,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double ; ; GFX9-LABEL: atomic_xchg_f64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4564,7 +4564,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double ; ; GFX12-LABEL: atomic_xchg_f64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -4581,7 +4581,7 @@ entry: define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr %in) { ; 
CI-LABEL: atomic_xchg_pointer_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -4594,7 +4594,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xchg_pointer_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -4607,7 +4607,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xchg_pointer_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4619,7 +4619,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_xchg_pointer_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -4636,8 +4636,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_xchg_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4655,8 +4655,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: 
atomic_xchg_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4674,12 +4674,12 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xchg_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4689,8 +4689,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_xchg_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -4711,8 +4711,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_xchg_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; 
CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4728,8 +4728,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_xchg_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4747,12 +4747,12 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_xchg_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -4764,8 +4764,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out, ; GFX12-LABEL: atomic_xchg_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -4786,7 +4786,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_xchg_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 
+; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4807,7 +4807,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_xchg_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -4830,7 +4830,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -4846,7 +4846,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -4871,7 +4871,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_xchg_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4886,7 +4886,7 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_xchg_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4901,7 +4901,7 
@@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_xchg_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4913,7 +4913,7 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_xchg_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -4929,8 +4929,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_xchg_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4948,8 +4948,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: atomic_xchg_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4967,12 +4967,12 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: atomic_xchg_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4982,8 +4982,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX12-LABEL: atomic_xchg_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -5003,8 +5003,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_xchg_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5020,8 +5020,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in ; ; VI-LABEL: atomic_xchg_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5037,12 +5037,12 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in ; ; GFX9-LABEL: atomic_xchg_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -5054,8 +5054,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in ; GFX12-LABEL: atomic_xchg_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5075,7 +5075,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_xchg_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5096,7 +5096,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xchg_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -5117,7 +5117,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xchg_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], 
s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -5133,7 +5133,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_xchg_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -5157,7 +5157,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_xor_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -5170,7 +5170,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_xor_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -5183,7 +5183,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_xor_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -5195,7 +5195,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX12-LABEL: atomic_xor_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -5212,8 
+5212,8 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_xor_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5231,8 +5231,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xor_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5250,12 +5250,12 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xor_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -5265,8 +5265,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_xor_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: 
s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -5287,8 +5287,8 @@ entry: define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_xor_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5304,8 +5304,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_xor_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5323,12 +5323,12 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_xor_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -5340,8 +5340,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_xor_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: 
s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5362,7 +5362,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_xor_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5383,7 +5383,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_xor_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -5406,7 +5406,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_xor_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -5422,7 +5422,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -5447,7 +5447,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: 
atomic_xor_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5462,7 +5462,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_xor_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5477,7 +5477,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_xor_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -5489,7 +5489,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_xor_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -5505,8 +5505,8 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_xor_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5524,8 +5524,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_xor_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: 
s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5543,12 +5543,12 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_xor_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -5558,8 +5558,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-LABEL: atomic_xor_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -5579,8 +5579,8 @@ entry: define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_xor_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5596,8 +5596,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) 
%out, i64 %in, ; ; VI-LABEL: atomic_xor_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5613,12 +5613,12 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_xor_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -5630,8 +5630,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-LABEL: atomic_xor_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5651,7 +5651,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_xor_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5672,7 +5672,7 @@ define amdgpu_kernel void 
@atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xor_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -5693,7 +5693,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xor_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -5709,7 +5709,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_xor_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -5733,50 +5733,50 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr addrspace(1) %out, i64 %in, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 +; CI-NEXT: s_mov_b32 s8, s4 +; CI-NEXT: s_mov_b32 s9, s5 ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: v_mov_b32_e32 v2, s8 -; CI-NEXT: v_mov_b32_e32 v3, s9 -; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 offset:32 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; 
CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 offset:32 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 offset:32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_cmpxchg_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[4:5] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -5785,8 +5785,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr addrspace(1) %out, i64 ; GFX12-LABEL: atomic_cmpxchg_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 
s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1 @@ -5804,52 +5804,52 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr addrspace(1) %out, i64 %in, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_soffset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: s_mov_b32 s2, 0x11940 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 -; CI-NEXT: s_mov_b32 s4, 0x11940 +; CI-NEXT: s_mov_b32 s8, s4 +; CI-NEXT: s_mov_b32 s9, s5 ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: v_mov_b32_e32 v2, s8 -; CI-NEXT: v_mov_b32_e32 v3, s9 -; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], s4 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], s2 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i64_soffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, 0x11940 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, 0x11940 +; VI-NEXT: 
s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], s4 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], s2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_cmpxchg_i64_soffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x11000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[4:5] offset:2368 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -5858,8 +5858,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr addrspace(1) %out, i64 ; GFX12-LABEL: atomic_cmpxchg_i64_soffset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1 @@ -5877,7 +5877,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; CI-NEXT: 
s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5897,7 +5897,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_cmpxchg_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5917,7 +5917,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_cmpxchg_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -5932,7 +5932,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out, ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 @@ -5955,7 +5955,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5974,7 +5974,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_cmpxchg_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; 
VI-NEXT: s_add_u32 s0, s0, s4 @@ -5994,7 +5994,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_cmpxchg_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 @@ -6011,7 +6011,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou ; ; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7 @@ -6033,32 +6033,32 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 +; CI-NEXT: s_mov_b32 s15, 0xf000 +; CI-NEXT: s_mov_b32 s14, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshl_b64 s[10:11], s[10:11], 3 -; CI-NEXT: v_mov_b32_e32 v4, s10 -; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 +; CI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 +; CI-NEXT: v_mov_b32_e32 v5, s3 +; CI-NEXT: s_mov_b32 s12, s6 +; CI-NEXT: s_mov_b32 s13, s7 ; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: s_mov_b32 s7, s15 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: v_mov_b32_e32 v2, s12 -; CI-NEXT: v_mov_b32_e32 v3, s13 -; CI-NEXT: v_mov_b32_e32 v5, s11 +; CI-NEXT: v_mov_b32_e32 
v2, s0 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v4, s2 ; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -6083,9 +6083,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) ; ; GFX9-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GFX9-NEXT: s_add_u32 s2, s4, s2 @@ -6103,8 +6103,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) ; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 +; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9 ; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1 @@ -6131,50 +6131,50 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64(ptr addrspace(1) %out, i64 %in, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 +; CI-NEXT: s_mov_b32 s8, s4 +; CI-NEXT: s_mov_b32 s9, s5 ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: v_mov_b32_e32 v2, s8 -; CI-NEXT: v_mov_b32_e32 v3, s9 -; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_cmpxchg_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6183,8 +6183,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr addrspace(1) %out, i64 %in, i6 ; GFX12-LABEL: atomic_cmpxchg_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1 @@ -6201,7 +6201,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6221,7 +6221,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: atomic_cmpxchg_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6241,7 +6241,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: atomic_cmpxchg_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: 
v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -6256,7 +6256,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 @@ -6278,7 +6278,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6297,7 +6297,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 ; ; VI-LABEL: atomic_cmpxchg_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; VI-NEXT: s_add_u32 s0, s0, s4 @@ -6315,7 +6315,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 ; ; GFX9-LABEL: atomic_cmpxchg_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 @@ -6332,7 +6332,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 ; ; GFX12-LABEL: atomic_cmpxchg_i64_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, 
s3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7 @@ -6353,32 +6353,32 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 +; CI-NEXT: s_mov_b32 s15, 0xf000 +; CI-NEXT: s_mov_b32 s14, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshl_b64 s[10:11], s[10:11], 3 -; CI-NEXT: v_mov_b32_e32 v4, s10 -; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 +; CI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 +; CI-NEXT: v_mov_b32_e32 v5, s3 +; CI-NEXT: s_mov_b32 s12, s6 +; CI-NEXT: s_mov_b32 s13, s7 ; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: s_mov_b32 s7, s15 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: v_mov_b32_e32 v2, s12 -; CI-NEXT: v_mov_b32_e32 v3, s13 -; CI-NEXT: v_mov_b32_e32 v5, s11 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v4, s2 ; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; VI-NEXT: s_add_u32 s2, s4, s2 @@ -6401,9 +6401,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr 
addrspace(1) %out, ; ; GFX9-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GFX9-NEXT: s_add_u32 s2, s4, s2 @@ -6421,8 +6421,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 +; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9 ; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1 @@ -6448,7 +6448,7 @@ entry: define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; CI-LABEL: atomic_load_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6464,7 +6464,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6482,7 +6482,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] offset:32 glc @@ -6493,7 +6493,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addr ; ; GFX12-LABEL: atomic_load_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT @@ -6513,7 +6513,7 @@ entry: define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; CI-LABEL: atomic_load_i64_neg_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: v_not_b32_e32 v0, 31 ; CI-NEXT: v_mov_b32_e32 v1, -1 @@ -6531,7 +6531,7 @@ define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr ; ; VI-LABEL: atomic_load_i64_neg_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6549,7 +6549,7 @@ define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr ; ; GFX9-LABEL: atomic_load_i64_neg_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] offset:-32 glc @@ -6560,7 +6560,7 @@ define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr ; ; GFX12-LABEL: atomic_load_i64_neg_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:-32 th:TH_LOAD_NT @@ -6580,7 +6580,7 @@ entry: define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; CI-LABEL: atomic_load_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6596,7 +6596,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1 ; ; VI-LABEL: atomic_load_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6612,7 +6612,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1 ; ; GFX9-LABEL: atomic_load_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc @@ -6623,7 +6623,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1 ; ; GFX12-LABEL: atomic_load_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT @@ -6642,8 +6642,8 @@ entry: define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; CI-LABEL: atomic_load_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 
; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6662,8 +6662,8 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p ; ; VI-LABEL: atomic_load_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6684,11 +6684,11 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p ; ; GFX9-LABEL: atomic_load_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] offset:32 glc @@ -6700,8 +6700,8 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p ; GFX12-LABEL: atomic_load_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 @@ -6725,28 +6725,28 @@ entry: define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; CI-LABEL: atomic_load_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; CI-NEXT: s_mov_b32 
s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_lshl_b64 s[8:9], s[8:9], 3 -; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: s_mov_b32 s1, s7 +; CI-NEXT: s_mov_b32 s8, s6 +; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_mov_b32 s9, s7 ; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: s_mov_b32 s7, s3 -; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: s_mov_b32 s7, s11 +; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_load_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6765,11 +6765,11 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc @@ -6781,8 +6781,8 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr 
addr ; GFX12-LABEL: atomic_load_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 @@ -6805,8 +6805,8 @@ entry: define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; CI-LABEL: atomic_load_f64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6825,8 +6825,8 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p ; ; VI-LABEL: atomic_load_f64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6847,11 +6847,11 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p ; ; GFX9-LABEL: atomic_load_f64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] 
offset:32 glc @@ -6863,8 +6863,8 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p ; GFX12-LABEL: atomic_load_f64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 @@ -6888,7 +6888,7 @@ entry: define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %out) { ; CI-LABEL: atomic_store_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6901,7 +6901,7 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %ou ; ; VI-LABEL: atomic_store_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s2, 32 @@ -6914,7 +6914,7 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_store_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -6924,7 +6924,7 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %ou ; ; GFX12-LABEL: atomic_store_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 
@@ -6941,7 +6941,7 @@ entry: define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) { ; CI-LABEL: atomic_store_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6954,7 +6954,7 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -6965,7 +6965,7 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) { ; ; GFX9-LABEL: atomic_store_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -6975,7 +6975,7 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) { ; ; GFX12-LABEL: atomic_store_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -6991,8 +6991,8 @@ entry: define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace(1) %out, i64 %index) { ; CI-LABEL: atomic_store_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -7007,8 +7007,8 @@ define amdgpu_kernel void 
@atomic_store_i64_addr64_offset(i64 %in, ptr addrspace ; ; VI-LABEL: atomic_store_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -7024,12 +7024,12 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace ; ; GFX9-LABEL: atomic_store_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s6, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_addc_u32 s1, s7, s1 @@ -7039,8 +7039,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace ; GFX12-LABEL: atomic_store_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -7061,24 +7061,24 @@ entry: define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %out, i64 %index) { ; CI-LABEL: atomic_store_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v1, s5 -; CI-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] -; CI-NEXT: v_mov_b32_e32 v3, s5 -; CI-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[0:3], 0 addr64 +; CI-NEXT: s_mov_b64 s[8:9], s[6:7] +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[8:11], 0 addr64 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_store_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -7092,12 +7092,12 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_store_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s6, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_addc_u32 s1, s7, s1 @@ -7107,8 +7107,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %ou ; GFX12-LABEL: atomic_store_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], 
s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -7128,8 +7128,8 @@ entry: define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrspace(1) %out, i64 %index) { ; CI-LABEL: atomic_store_f64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -7144,8 +7144,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrsp ; ; VI-LABEL: atomic_store_f64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -7161,12 +7161,12 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrsp ; ; GFX9-LABEL: atomic_store_f64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s6, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_addc_u32 s1, s7, s1 @@ -7176,8 +7176,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrsp ; GFX12-LABEL: atomic_store_f64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: 
s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -7198,7 +7198,7 @@ entry: define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_inc_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -7211,7 +7211,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_inc_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -7224,7 +7224,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_inc_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -7236,7 +7236,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX12-LABEL: atomic_inc_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -7253,8 +7253,8 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_inc_i64_ret_offset: ; CI: ; 
%bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -7272,8 +7272,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_inc_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7291,12 +7291,12 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_inc_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -7306,8 +7306,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_inc_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -7328,8 +7328,8 
@@ entry: define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_inc_i64_incr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -7345,8 +7345,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_inc_i64_incr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -7364,12 +7364,12 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_inc_i64_incr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -7381,8 +7381,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_inc_i64_incr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; 
GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -7403,7 +7403,7 @@ entry: define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_dec_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -7416,7 +7416,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_dec_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -7429,7 +7429,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_dec_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -7441,7 +7441,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX12-LABEL: atomic_dec_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -7458,8 +7458,8 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_dec_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; 
CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -7477,8 +7477,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_dec_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7496,12 +7496,12 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_dec_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -7511,8 +7511,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_dec_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -7533,8 +7533,8 @@ entry: define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_dec_i64_decr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -7550,8 +7550,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_dec_i64_decr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -7569,12 +7569,12 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_dec_i64_decr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -7586,8 +7586,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_dec_i64_decr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll 
b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll index cafd35afea6eb..8897ad3e950a5 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -4866,8 +4866,8 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; SI-LABEL: atomic_max_i64_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; SI-NEXT: s_add_u32 s4, s0, s4 @@ -4905,8 +4905,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_max_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -4941,15 +4941,15 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_max_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; 
GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -4981,7 +4981,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; SI-LABEL: atomic_max_i64_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; SI-NEXT: s_add_u32 s8, s0, s6 @@ -5025,7 +5025,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_max_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -5064,7 +5064,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -5107,8 +5107,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; SI-LABEL: atomic_max_i64_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; SI-NEXT: s_add_u32 s4, s0, s4 @@ -5146,8 +5146,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; VI-LABEL: atomic_max_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; 
VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; VI-NEXT: s_add_u32 s4, s0, s4 @@ -5180,15 +5180,15 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_max_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5219,7 +5219,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; SI-LABEL: atomic_max_i64_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; SI-NEXT: s_add_u32 s8, s0, s6 @@ -5263,7 +5263,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_max_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; VI-NEXT: s_add_u32 s6, s0, s6 @@ -5300,7 +5300,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_max_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], 
s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -6328,8 +6328,8 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; SI-LABEL: atomic_umax_i64_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; SI-NEXT: s_add_u32 s4, s0, s4 @@ -6367,8 +6367,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_umax_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -6403,15 +6403,15 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_umax_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) @@ -6443,7 +6443,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; SI-LABEL: atomic_umax_i64_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; SI-NEXT: s_add_u32 s8, s0, s6 @@ -6487,7 +6487,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umax_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -6526,7 +6526,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -6569,7 +6569,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; SI-LABEL: atomic_umax_i64_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; SI-NEXT: s_add_u32 s8, s0, s6 @@ -6613,7 +6613,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umax_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; VI-NEXT: s_add_u32 s6, s0, s6 
@@ -6650,7 +6650,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -8664,8 +8664,8 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; SI-LABEL: atomic_min_i64_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; SI-NEXT: s_add_u32 s4, s0, s4 @@ -8703,8 +8703,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_min_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -8739,15 +8739,15 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_min_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX9-NEXT: s_add_u32 
s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -8779,7 +8779,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; SI-LABEL: atomic_min_i64_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; SI-NEXT: s_add_u32 s8, s0, s6 @@ -8823,7 +8823,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_min_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -8862,7 +8862,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -8905,7 +8905,7 @@ entry: define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: atomic_min_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SI-NEXT: s_mov_b64 s[8:9], 0 @@ -8942,7 +8942,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_min_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 @@ -8972,7 +8972,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_min_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -9006,7 +9006,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; SI-LABEL: atomic_min_i64_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; SI-NEXT: s_add_u32 s8, s0, s6 @@ -9050,7 +9050,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_min_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; VI-NEXT: s_add_u32 s6, s0, s6 @@ -9087,7 +9087,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_min_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 04df04a5c299b..ab32efc4d3cd8 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -18,15 +18,15 @@ declare double @div.double.value() 
define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -54,23 +54,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, 
v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 @@ -86,23 +86,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB0_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -118,22 +118,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1032-LABEL: 
global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB0_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s5 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 @@ -141,30 +141,30 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 ; GFX1032-NEXT: .LBB0_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: 
v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mul_f32_e32 v0, 4.0, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_atomic_add_f32 v1, v0, s[0:1] +; GFX1164-NEXT: global_atomic_add_f32 v1, v0, s[2:3] ; GFX1164-NEXT: .LBB0_2: ; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -172,20 +172,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0 ; 
GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_atomic_add_f32 v1, v0, s[0:1] +; GFX1132-NEXT: global_atomic_add_f32 v1, v0, s[2:3] ; GFX1132-NEXT: .LBB0_2: ; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -193,23 +193,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] -; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -225,23 +225,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec -; 
GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -257,22 +257,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; 
GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -280,30 +280,30 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 ; GFX1032-DPP-NEXT: .LBB0_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-DPP-NEXT: ; 
%bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mul_f32_e32 v0, 4.0, v0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_atomic_add_f32 v1, v0, s[0:1] +; GFX1164-DPP-NEXT: global_atomic_add_f32 v1, v0, s[2:3] ; GFX1164-DPP-NEXT: .LBB0_2: ; GFX1164-DPP-NEXT: s_nop 0 ; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -311,20 +311,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_atomic_add_f32 v1, v0, s[0:1] +; GFX1132-DPP-NEXT: global_atomic_add_f32 v1, v0, s[2:3] ; 
GFX1132-DPP-NEXT: .LBB0_2: ; GFX1132-DPP-NEXT: s_nop 0 ; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1054,24 +1054,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ret void } -define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 { -; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp: +define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s10, -1 -; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 -; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -1101,27 +1101,27 @@ 
define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: .LBB2_3: ; GFX7LESS-NEXT: s_endpgm ; -; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s3, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1143,33 +1143,33 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX9-NEXT: .LBB2_3: ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1064-LABEL: 
global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s14, -1 +; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -1183,31 +1183,31 @@ define amdgpu_kernel 
void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: .LBB2_3: ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s8, s8, s3 -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_addc_u32 s9, s9, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s14, -1 +; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s12, s12, s9 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_addc_u32 s13, s13, 0 +; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 -; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: 
v_mov_b32_e32 v1, s2 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1216,19 +1216,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-NEXT: .LBB2_3: ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -1240,7 +1240,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1265,14 +1265,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: .LBB2_3: ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp: 
+; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -1282,14 +1282,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1298,34 +1298,34 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1132-NEXT: .LBB2_3: ; GFX1132-NEXT: s_endpgm ; -; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s10, -1 -; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s14, -1 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1347,33 +1347,33 @@ define 
amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: .LBB2_3: ; GFX9-DPP-NEXT: s_endpgm ; -; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-DPP-NEXT: 
v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -1387,31 +1387,31 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: .LBB2_3: ; GFX1064-DPP-NEXT: s_endpgm ; -; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 -; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_add_f64 
v[0:1], 0xc3300000, s[4:5] +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1420,19 +1420,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-DPP-NEXT: .LBB2_3: ; GFX1032-DPP-NEXT: s_endpgm ; -; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; 
GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -1444,7 +1444,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1469,14 +1469,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: .LBB2_3: ; GFX1164-DPP-NEXT: s_endpgm ; -; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -1486,14 +1486,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 
s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1502,9 +1502,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1132-DPP-NEXT: .LBB2_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -1513,8 +1513,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope } -define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 { -; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp: +define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 ; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 @@ -1564,7 +1564,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; -; GFX9-LABEL: 
global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -1630,7 +1630,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-NEXT: .LBB3_5: ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -1696,7 +1696,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-NEXT: .LBB3_5: ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -1761,7 +1761,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-NEXT: .LBB3_5: ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 @@ -1822,7 +1822,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1164-NEXT: .LBB3_5: ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] 
; GFX1132-NEXT: v_mov_b32_e32 v31, v0 @@ -1882,7 +1882,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1132-NEXT: .LBB3_5: ; GFX1132-NEXT: s_endpgm ; -; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -1968,7 +1968,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: .LBB3_3: ; GFX9-DPP-NEXT: s_endpgm ; -; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1064-DPP: ; %bb.0: ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -2050,7 +2050,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: .LBB3_3: ; GFX1064-DPP-NEXT: s_endpgm ; -; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -2126,7 +2126,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: .LBB3_3: ; GFX1032-DPP-NEXT: s_endpgm ; -; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 @@ -2208,7 +2208,7 @@ define amdgpu_kernel void 
@global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: .LBB3_3: ; GFX1164-DPP-NEXT: s_endpgm ; -; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 @@ -2289,21 +2289,21 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr) #2{ ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s10, -1 -; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 -; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -2335,25 +2335,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s3, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2377,31 +2377,31 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s10, -1 -; 
GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s14, -1 +; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -2417,29 +2417,29 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: 
s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s8, s8, s3 -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_addc_u32 s9, s9, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s14, -1 +; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s12, s12, s9 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_addc_u32 s13, s13, 0 +; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 -; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2448,19 +2448,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; 
GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-NEXT: .LBB4_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -2472,7 +2472,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2499,12 +2499,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: 
scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -2514,14 +2514,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2530,34 +2530,34 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s10, -1 -; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; 
GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s14, -1 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2581,31 +2581,31 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: 
v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -2621,29 +2621,29 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo -; 
GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 -; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2652,19 +2652,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, 
vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-DPP-NEXT: .LBB4_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -2676,7 +2676,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2703,12 +2703,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, 
exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -2718,14 +2718,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2734,9 +2734,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-DPP-NEXT: .LBB4_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -3467,8 +3467,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ } -define amdgpu_kernel void 
@global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 { -; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp: +define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 ; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 @@ -3518,7 +3518,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; -; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -3584,7 +3584,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: .LBB6_5: ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -3650,7 +3650,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB6_5: ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -3715,7 +3715,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: .LBB6_5: ; 
GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 @@ -3763,7 +3763,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: .LBB6_4: ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 @@ -3810,7 +3810,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: .LBB6_4: ; GFX1132-NEXT: s_endpgm ; -; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -3896,7 +3896,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: .LBB6_3: ; GFX9-DPP-NEXT: s_endpgm ; -; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1064-DPP: ; %bb.0: ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -3978,7 +3978,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: .LBB6_3: ; GFX1064-DPP-NEXT: s_endpgm ; -; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1032-DPP-LABEL: 
global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -4054,7 +4054,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: .LBB6_3: ; GFX1032-DPP-NEXT: s_endpgm ; -; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 @@ -4123,7 +4123,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: .LBB6_2: ; GFX1164-DPP-NEXT: s_endpgm ; -; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 @@ -4191,21 +4191,21 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scope_strictfp(ptr addrspace(1) %ptr) #2 { ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s10, -1 -; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 -; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; 
GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB7_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -4237,25 +4237,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 
s3, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -4279,31 +4279,31 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s14, -1 +; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -4319,29 +4319,29 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s8, s8, s3 -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_addc_u32 s9, s9, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s14, -1 +; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s12, s12, s9 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_addc_u32 s13, s13, 0 +; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 -; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; 
GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4350,19 +4350,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1032-NEXT: .LBB7_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -4374,7 +4374,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -4401,12 
+4401,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -4416,14 +4416,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4432,34 +4432,34 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, 
s2 +; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1132-NEXT: .LBB7_3: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s10, -1 -; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s14, -1 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4483,31 +4483,31 @@ define amdgpu_kernel void 
@global_atomic_fadd_uni_address_uni_value_default_scop ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: v_mul_f32_e32 
v2, 4.0, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -4523,29 +4523,29 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 -; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4554,19 +4554,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1032-DPP-NEXT: .LBB7_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -4578,7 +4578,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -4605,12 +4605,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -4620,14 +4620,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4636,9 +4636,9 @@ define amdgpu_kernel void 
@global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1132-DPP-NEXT: .LBB7_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -5422,891 +5422,1708 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB9_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; 
GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 -; GFX7LESS-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[1:2], s2 +; GFX7LESS-NEXT: v_or_b32_e32 v4, v0, v4 +; GFX7LESS-NEXT: v_mul_f64 v[41:42], v[1:2], 4.0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-NEXT: v_or_b32_e32 v40, v4, v3 ; GFX7LESS-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 
offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2 ; GFX7LESS-NEXT: .LBB9_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; 
GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 
exec, exec, s[2:3] +; GFX9-NEXT: v_add_f64 v[0:1], v[3:4], v[41:42] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-NEXT: s_cbranch_execnz .LBB9_2 ; GFX9-NEXT: .LBB9_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: 
s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, s9, v3 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[8:9] +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX1064-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 +; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v2, s2 -; GFX1064-NEXT: v_mov_b32_e32 v3, s3 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 ; GFX1064-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; 
GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1064-NEXT: s_add_u32 s8, s36, 44 +; GFX1064-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX1064-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1064-NEXT: 
.LBB9_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_mov_b32 s8, exec_lo +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s8 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX1032-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 +; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-NEXT: 
v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 ; GFX1032-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1032-NEXT: s_add_u32 s8, s36, 44 +; GFX1032-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword 
v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 ; GFX1032-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1032-NEXT: .LBB9_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_mov_b64 s[8:9], exec +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[8:9] +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: 
v_mov_b32_e32 v2, s2 -; GFX1164-NEXT: v_mov_b32_e32 v3, s3 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1164-NEXT: s_add_u32 s8, s36, 44 +; GFX1164-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 
s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1164-NEXT: .LBB9_3: +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: s_mov_b32 s6, exec_lo +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 -; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s6 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_mul_f64 
v[41:42], v[0:1], 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB9_2 -; GFX1132-NEXT: .LBB9_3: +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1132-NEXT: s_add_u32 s8, s36, 44 +; GFX1132-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1132-NEXT: .LBB9_3: +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 +; GFX9-DPP-NEXT: 
v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[3:4], v[41:42] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 
s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v4, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX9-DPP-NEXT: .LBB9_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s9, v3 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; 
GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[8:9] +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1064-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; 
GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1064-DPP-NEXT: .LBB9_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; 
GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s8, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1032-DPP-NEXT: .LBB9_2: ; 
%atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1032-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: 
buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1032-DPP-NEXT: .LBB9_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[8:9], exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[8:9] +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; 
GFX1164-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1164-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: 
s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1164-DPP-NEXT: .LBB9_3: +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: s_mov_b32 s6, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s6 +; 
GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1132-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: 
v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1132-DPP-NEXT: .LBB9_3: +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 8 + %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4 ret void } define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, 
s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-NEXT: 
s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 ; GFX7LESS-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[40:41] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; 
GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], 
s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: v_mov_b32_e32 v43, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: 
s_mov_b64 s[44:45], 0 ; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, 
s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-NEXT: s_cbranch_execnz .LBB10_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s38, -1 -; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 -; 
GFX1064-NEXT: s_mov_b32 s13, s7 -; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mov_b32_e32 v43, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 ; GFX1064-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; 
GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX1064-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s38, -1 -; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, 
div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 -; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_mov_b32_e32 v43, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], 
s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: s_mov_b32 s44, 0 ; GFX1032-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, 
s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 ; GFX1032-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 -; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; 
GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: v_mov_b32_e32 v43, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: v_mov_b32_e32 v41, v1 +; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; 
GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; 
GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 -; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 
v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s38, -1 -; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; 
GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-DPP-NEXT: 
global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-DPP-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: 
s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, 
div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX1064-DPP-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: 
v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; 
GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; 
GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; 
GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164-DPP: ; 
%bb.0: -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1164-DPP-NEXT: v_mov_b32_e32 
v40, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: 
s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], 
s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() - %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8 + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 4 ret void } -define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 { -; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: +define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: 
s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 ; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB11_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[4:5] ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -6339,27 +7156,27 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: .LBB11_3: ; GFX7LESS-NEXT: s_endpgm ; -; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: 
v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB11_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s3, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -6382,25 +7199,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX9-NEXT: .LBB11_3: ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s14, -1 +; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: 
s_cbranch_execz .LBB11_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -6423,32 +7240,32 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1064-NEXT: .LBB11_3: ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s8, s8, s3 -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_addc_u32 s9, s9, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s14, -1 +; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s12, s12, s9 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_addc_u32 s13, s13, 0 +; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 -; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s0 +; GFX1032-NEXT: s_mov_b32 s7, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: v_mov_b32_e32 v3, s3 ; GFX1032-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -6457,19 +7274,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1032-NEXT: .LBB11_3: ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: 
scratch_store_b32 off, v1, off @@ -6481,7 +7298,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -6507,14 +7324,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1164-NEXT: .LBB11_3: ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -6524,14 +7341,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: 
v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX1132-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6540,34 +7357,34 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1132-NEXT: .LBB11_3: ; GFX1132-NEXT: s_endpgm ; -; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s10, -1 -; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s14, -1 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; 
GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6590,25 +7407,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: .LBB11_3: ; GFX9-DPP-NEXT: s_endpgm ; -; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; 
GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[0:1] +; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -6631,32 +7448,32 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: .LBB11_3: ; GFX1064-DPP-NEXT: s_endpgm ; -; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; 
GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 -; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s0 +; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -6665,19 +7482,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1032-DPP-NEXT: .LBB11_3: ; GFX1032-DPP-NEXT: s_endpgm ; -; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-DPP-NEXT: 
s_bcnt1_i32_b64 s0, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -6689,7 +7506,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -6715,14 +7532,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: .LBB11_3: ; GFX1164-DPP-NEXT: s_endpgm ; -; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -6732,14 +7549,14 @@ define 
amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6748,9 +7565,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1132-DPP-NEXT: .LBB11_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -6758,8 +7575,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ret void } -define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 { -; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: +define amdgpu_kernel void 
@global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 ; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 @@ -6812,7 +7629,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; -; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -6858,7 +7675,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -6904,7 +7721,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -6950,7 +7767,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: 
global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b32 s14, s8 ; GFX1164-NEXT: s_add_u32 s8, s2, 44 @@ -6987,7 +7804,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_add_u32 s8, s2, 44 ; GFX1132-NEXT: s_addc_u32 s9, s3, 0 @@ -7022,7 +7839,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; -; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -7068,7 +7885,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; -; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1064-DPP: ; %bb.0: ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -7114,7 +7931,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; -; GFX1032-DPP-LABEL: 
global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -7160,7 +7977,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; -; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 @@ -7197,7 +8014,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; -; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 @@ -7243,17 +8060,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 ; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: 
s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[4:5] ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -7288,25 +8105,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s3, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 
s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -7331,23 +8148,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s14, -1 +; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -7372,30 +8189,30 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s8, 
SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s8, s8, s3 -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_addc_u32 s9, s9, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s14, -1 +; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s12, s12, s9 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_addc_u32 s13, s13, 0 +; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 -; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s0 +; GFX1032-NEXT: s_mov_b32 s7, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: v_mov_b32_e32 v3, s3 ; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -7404,19 +8221,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-NEXT: 
v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-NEXT: .LBB13_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -7428,7 +8245,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -7456,12 +8273,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: 
s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -7471,14 +8288,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -7487,34 +8304,34 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s10, -1 -; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 
-; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s14, -1 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -7539,23 +8356,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, 
s3, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[0:1] +; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -7580,30 +8397,30 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s13, 
SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 -; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s0 +; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -7612,19 +8429,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-DPP-NEXT: .LBB13_3: 
; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -7636,7 +8453,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -7664,12 +8481,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, 
v1, off @@ -7679,14 +8496,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -7695,9 +8512,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-DPP-NEXT: .LBB13_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -8183,8 +9000,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ret void } -define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 { -; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: +define amdgpu_kernel void 
@global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 ; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 @@ -8237,7 +9054,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; -; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -8283,7 +9100,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -8329,7 +9146,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -8375,7 +9192,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: 
global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b32 s14, s8 ; GFX1164-NEXT: s_add_u32 s8, s2, 44 @@ -8412,7 +9229,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_add_u32 s8, s2, 44 ; GFX1132-NEXT: s_addc_u32 s9, s3, 0 @@ -8447,7 +9264,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; -; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -8493,7 +9310,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; -; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1064-DPP: ; %bb.0: ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -8539,7 +9356,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; -; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: +; 
GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -8585,7 +9402,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; -; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 @@ -8622,7 +9439,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; -; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 @@ -8664,962 +9481,1732 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp(ptr addrspace(1) %ptr) #2 { ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s14, -1 -; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 -; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; 
GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX7LESS-NEXT: s_mov_b32 s1, 0x43300000 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0xc3300000 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v5, exec_hi, v5 +; GFX7LESS-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] +; GFX7LESS-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB16_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] -; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] -; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8 
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-NEXT: v_or_b32_e32 v40, v3, v2 ; GFX7LESS-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; 
GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2 ; GFX7LESS-NEXT: .LBB16_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX9-NEXT: v_mov_b32_e32 v4, 0xc3300000 +; GFX9-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB16_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s3, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 
s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-NEXT: s_cbranch_execnz .LBB16_2 ; GFX9-NEXT: .LBB16_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: 
s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] +; GFX1064-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB16_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v2, s2 -; GFX1064-NEXT: v_mov_b32_e32 v3, s3 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 ; GFX1064-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, 
v[0:1], v[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1064-NEXT: s_add_u32 s8, s36, 44 +; GFX1064-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX1064-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1064-NEXT: .LBB16_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: 
global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s8, s8, s3 -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_addc_u32 s9, s9, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB16_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 -; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 ; GFX1032-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1032-NEXT: s_add_u32 s8, s36, 44 +; GFX1032-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 
+; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 ; GFX1032-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1032-NEXT: .LBB16_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 -; GFX1164-NEXT: scratch_store_b32 off, v1, off -; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:20 +; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:16 +; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:16 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; 
GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB16_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164-NEXT: v_mov_b32_e32 v3, s3 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1164-NEXT: s_add_u32 s8, s36, 44 +; GFX1164-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: 
s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1164-NEXT: .LBB16_3: +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1132-NEXT: s_mov_b32 s44, 0 ; 
GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 -; GFX1132-NEXT: scratch_store_b32 off, v1, off -; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB16_3 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:20 +; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:16 +; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:16 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB16_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], 
s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1132-NEXT: s_add_u32 s8, s36, 44 +; GFX1132-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1132-NEXT: .LBB16_3: +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: 
global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s10, -1 -; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xc3300000 +; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX9-DPP-NEXT: 
s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; 
GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX9-DPP-NEXT: .LBB16_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; 
GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] +; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1064-DPP-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 -; 
GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1064-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX1064-DPP-NEXT: s_cbranch_execnz 
.LBB16_2 ; GFX1064-DPP-NEXT: .LBB16_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 -; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; 
GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1032-DPP-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1032-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: 
s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1032-DPP-NEXT: .LBB16_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 -; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off -; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:20 +; GFX1164-DPP-NEXT: 
scratch_store_b32 off, v1, off offset:16 +; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: 
v_add_f64 v[0:1], v[2:3], v[4:5] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1164-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; 
GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1164-DPP-NEXT: .LBB16_3: +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 -; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off -; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB16_3 -; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:20 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 +; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; 
GFX1132-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1132-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; 
GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1132-DPP-NEXT: .LBB16_3: +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 monotonic, align 8 + %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 monotonic, align 4 ret void } define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp(ptr addrspace(1) %ptr) #2 { ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 
s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; 
GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 ; GFX7LESS-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[40:41] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 
s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: 
s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: v_mov_b32_e32 v43, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-NEXT: s_cbranch_execnz .LBB17_1 ; GFX9-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s38, -1 -; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 -; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] -; 
GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mov_b32_e32 v43, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 ; GFX1064-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: 
v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX1064-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s38, -1 -; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: 
s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 -; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_mov_b32_e32 v43, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: 
s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: s_mov_b32 s44, 0 ; GFX1032-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 ; GFX1032-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 -; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: v_mov_b32_e32 v42, v0 +; 
GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: v_mov_b32_e32 v43, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: v_mov_b32_e32 v41, v1 +; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_clause 0x1 +; 
GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; 
GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 -; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; 
GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s38, -1 -; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 
0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 ; 
GFX9-DPP-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, 
s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: 
s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX1064-DPP-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-DPP-NEXT: 
s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; 
GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, 
s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 
s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: 
.LBB17_1: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; 
GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; 
GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() strictfp - %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue monotonic, align 8 + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue monotonic, align 4 ret void } define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) { ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; 
GFX7LESS-NEXT: s_cbranch_execz .LBB18_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -9647,23 +11234,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB18_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 @@ -9679,23 +11266,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1064-LABEL: 
global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB18_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -9711,22 +11298,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 ; GFX1032-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB18_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s5 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 @@ -9734,33 +11321,33 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_execnz .LBB18_2 ; GFX1032-NEXT: .LBB18_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; 
GFX1164-NEXT: s_cbranch_execz .LBB18_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[4:5] ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -9778,23 +11365,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: s_mov_b32 s5, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB18_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s5 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 
s4, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -9803,32 +11390,32 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: s_cbranch_execnz .LBB18_2 ; GFX1132-NEXT: .LBB18_3: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB18_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] -; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: 
v_mul_f32_e32 v2, 4.0, v0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-DPP-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -9844,23 +11431,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB18_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 
s[2:3], 0 ; GFX1064-DPP-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -9876,22 +11463,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB18_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-DPP-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -9899,33 +11486,33 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: 
s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB18_2 ; GFX1032-DPP-NEXT: .LBB18_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB18_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -9943,23 +11530,23 @@ define amdgpu_kernel void 
@global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s5, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB18_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -9968,9 +11555,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, 
exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB18_2 ; GFX1132-DPP-NEXT: .LBB18_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -9981,15 +11568,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) { ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB19_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -10017,23 +11604,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; 
GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB19_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 @@ -10049,23 +11636,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB19_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -10081,22 +11668,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB19_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s5 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 @@ -10104,33 +11691,33 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; 
GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_execnz .LBB19_2 ; GFX1032-NEXT: .LBB19_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB19_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[4:5] ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10148,23 +11735,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1132-LABEL: 
global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: s_mov_b32 s5, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB19_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s5 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10173,32 +11760,32 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: s_cbranch_execnz .LBB19_2 ; GFX1132-NEXT: .LBB19_3: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: 
global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB19_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] -; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-DPP-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -10214,23 +11801,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB19_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -10246,22 +11833,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB19_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-DPP-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -10269,33 +11856,33 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB19_2 ; GFX1032-DPP-NEXT: .LBB19_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB19_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 
0x24 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10313,23 +11900,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s5, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB19_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; 
GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10338,9 +11925,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB19_2 ; GFX1132-DPP-NEXT: .LBB19_3: ; GFX1132-DPP-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 005cd3a0021b3..a13e704a1a5fc 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -21,10 +21,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -54,10 +54,10 @@ define amdgpu_kernel void 
@global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -83,10 +83,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB0_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -98,10 +98,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB0_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -112,13 +112,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, 
exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -131,12 +131,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1] @@ -150,10 +150,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -179,10 +179,10 @@ define amdgpu_kernel void 
@global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -194,10 +194,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -208,13 +208,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; 
GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -227,12 +227,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1] @@ -976,10 +976,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -1009,10 +1009,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1038,10 +1038,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -1065,17 +1065,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 @@ -1084,8 +1084,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-NEXT: .LBB2_3: ; GFX1032-NEXT: s_endpgm @@ -1093,13 +1093,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB2_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1125,18 +1125,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB2_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: 
v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: v_mov_b32_e32 v1, s2 ; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1146,9 +1146,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1132-NEXT: .LBB2_3: ; GFX1132-NEXT: s_endpgm @@ -1158,10 +1158,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1187,10 +1187,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; 
GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -1214,17 +1214,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -1233,8 +1233,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-DPP-NEXT: .LBB2_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -1242,13 +1242,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; 
GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1274,18 +1274,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1295,9 +1295,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: s_waitcnt 
vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1132-DPP-NEXT: .LBB2_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -2158,10 +2158,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -2191,10 +2191,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2220,10 +2220,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], 
vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -2247,17 +2247,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 @@ -2266,8 +2266,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-NEXT: .LBB4_3: ; GFX1032-NEXT: s_endpgm @@ -2275,13 +2275,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; 
GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB4_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2307,18 +2307,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB4_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: v_mov_b32_e32 v1, s2 ; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2328,9 +2328,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: 
v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm @@ -2340,10 +2340,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2369,10 +2369,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -2396,17 +2396,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: 
s_mov_b32 s4, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -2415,8 +2415,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-DPP-NEXT: .LBB4_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -2424,13 +2424,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 
s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2456,18 +2456,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2477,9 +2477,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-DPP-NEXT: .LBB4_3: ; GFX1132-DPP-NEXT: s_endpgm @@ 
-3335,739 +3335,1653 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB6_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 -; 
GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-NEXT: v_or_b32_e32 v40, v3, v2 ; GFX7LESS-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 
+; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2 ; GFX7LESS-NEXT: .LBB6_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; 
GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-NEXT: s_cbranch_execnz .LBB6_2 ; GFX9-NEXT: .LBB6_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB6_2 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB6_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0x40100000 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; 
GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-NEXT: s_add_u32 s8, s36, 44 +; GFX1064-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1] -; GFX1064-NEXT: .LBB6_2: +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: 
s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1064-NEXT: .LBB6_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB6_2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0x40100000 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-NEXT: s_add_u32 s8, s36, 44 +; GFX1032-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1] -; GFX1032-NEXT: .LBB6_2: +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, 
exec_lo, s44 +; GFX1032-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1032-NEXT: .LBB6_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164-NEXT: v_mov_b32_e32 v3, s3 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-NEXT: v_mov_b32_e32 v3, v1 -; 
GFX1164-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-NEXT: s_add_u32 s8, s36, 44 +; GFX1164-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1164-NEXT: .LBB6_3: +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: +; 
GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB6_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-NEXT: s_add_u32 s8, s36, 44 +; GFX1132-NEXT: s_addc_u32 
s9, s37, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1132-NEXT: .LBB6_3: +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; 
GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 
44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX9-DPP-NEXT: .LBB6_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 
vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: 
s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1] -; GFX1064-DPP-NEXT: .LBB6_2: +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1064-DPP-NEXT: .LBB6_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; 
GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 
s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1] -; GFX1032-DPP-NEXT: .LBB6_2: +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1032-DPP-NEXT: .LBB6_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; 
%bb.0: +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; 
GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1164-DPP-NEXT: .LBB6_3: +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; 
GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: 
s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1132-DPP-NEXT: .LBB6_3: +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw 
fmax ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 8 + %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4 ret void } define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 
+; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, 
__atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; 
GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], 
s[48:49] +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, 
v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-NEXT: s_cbranch_execnz .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s38, -1 -; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; 
GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 -; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: s_add_u32 s8, 
s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s38, 
-1 -; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 -; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: v_or3_b32 v40, v0, 
v1, v2 +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_mov_b32_e32 v41, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: 
v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 -; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; 
GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: v_mov_b32_e32 v41, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: 
s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-NEXT: s_cbranch_execnz .LBB7_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 
s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 -; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, 
v40 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-NEXT: s_cbranch_execnz .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s38, -1 -; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, 
div.double.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: 
global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 
0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; 
GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f64 
v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: 
global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 
s13, s7 -; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; 
GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; 
GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; 
GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; 
GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: 
s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: 
v_dual_mov_b32 v1, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() - %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8 + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 4 ret void } @@ -4077,10 +4991,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB8_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -4114,10 +5028,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz 
.LBB8_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -4145,10 +5059,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB8_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -4174,18 +5088,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB8_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: v_mov_b32_e32 v3, s3 ; GFX1032-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f64 
v[0:1], v[2:3], v[2:3] @@ -4195,8 +5109,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1032-NEXT: .LBB8_3: ; GFX1032-NEXT: s_endpgm @@ -4204,13 +5118,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB8_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -4238,18 +5152,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB8_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; 
GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX1132-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4259,9 +5173,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1132-NEXT: .LBB8_3: ; GFX1132-NEXT: s_endpgm @@ -4271,10 +5185,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4302,10 +5216,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, 
v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -4331,18 +5245,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -4352,8 +5266,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 
exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1032-DPP-NEXT: .LBB8_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -4361,13 +5275,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -4395,18 +5309,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX1132-DPP-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4416,9 +5330,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1132-DPP-NEXT: .LBB8_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -4933,859 +5847,1653 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 +; 
GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB10_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-NEXT: v_or_b32_e32 v40, v3, v2 ; GFX7LESS-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: 
buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2 ; GFX7LESS-NEXT: .LBB10_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 
vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX9-NEXT: 
s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-NEXT: .LBB10_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1064-NEXT: s_mov_b32 s48, 
SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v2, s2 -; GFX1064-NEXT: v_mov_b32_e32 v3, s3 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 ; GFX1064-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-NEXT: 
s_add_u32 s8, s36, 44 +; GFX1064-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX1064-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-NEXT: .LBB10_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; 
GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 ; GFX1032-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: v_max_f64 
v[3:4], v[1:2], v[1:2] +; GFX1032-NEXT: s_add_u32 s8, s36, 44 +; GFX1032-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 ; GFX1032-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-NEXT: .LBB10_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; 
GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164-NEXT: v_mov_b32_e32 v3, s3 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-NEXT: 
s_add_u32 s8, s36, 44 +; GFX1164-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-NEXT: .LBB10_3: +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; 
GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-NEXT: s_add_u32 s8, s36, 44 +; GFX1132-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-NEXT: .LBB10_3: +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 
0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; 
GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; 
GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: 
global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: -; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], 
v[2:3] -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword 
v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; 
GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 
s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-DPP-NEXT: .LBB10_3: +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; 
GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; 
GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-DPP-NEXT: .LBB10_3: +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 monotonic, align 8 + %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 monotonic, align 4 ret void } define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, 
s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; 
GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 
s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, 
__atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-NEXT: s_cbranch_execnz .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s38, -1 -; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; 
GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 -; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 ; GFX1064-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], 
v[41:42] +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX1064-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s38, -1 -; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: 
s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 -; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_mov_b32_e32 v41, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-NEXT: s_mov_b32 s44, 0 ; GFX1032-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc 
-; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 ; GFX1032-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1032-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 -; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: v_mov_b32_e32 v41, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1164-NEXT: v_max_f64 
v[41:42], v[0:1], v[0:1] +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off 
+; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 -; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: 
v_dual_mov_b32 v41, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off +; 
GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s38, -1 -; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 
s9, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 
v[4:5] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: 
s_andn2_b64 exec, exec, s[44:45] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 
10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX1064-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: 
s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: 
global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: 
s_mov_b32 s13, s7 -; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, 
__atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: 
s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: 
s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: 
s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; 
GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() - %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue monotonic, align 8 + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue monotonic, align 4 ret void } @@ -5795,10 +7503,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB12_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -5828,10 +7536,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB12_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5857,10 +7565,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB12_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -5884,17 +7592,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB12_3 ; GFX1032-NEXT: ; %bb.1: -; 
GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 @@ -5903,8 +7611,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_execnz .LBB12_2 ; GFX1032-NEXT: .LBB12_3: ; GFX1032-NEXT: s_endpgm @@ -5912,13 +7620,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB12_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -5944,18 +7652,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; 
GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB12_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: v_mov_b32_e32 v1, s2 ; GFX1132-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5965,9 +7673,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: s_cbranch_execnz .LBB12_2 ; GFX1132-NEXT: .LBB12_3: ; GFX1132-NEXT: s_endpgm @@ -5977,10 +7685,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; 
GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6006,10 +7714,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -6033,17 +7741,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; 
GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -6052,8 +7760,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2 ; GFX1032-DPP-NEXT: .LBB12_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -6061,13 +7769,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -6093,18 +7801,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 
0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6114,9 +7822,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2 ; GFX1132-DPP-NEXT: .LBB12_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -6130,10 +7838,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -6163,10 +7871,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -6192,10 +7900,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -6219,17 +7927,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-NEXT: ; %bb.1: -; 
GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 @@ -6238,8 +7946,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-NEXT: .LBB13_3: ; GFX1032-NEXT: s_endpgm @@ -6247,13 +7955,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB13_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -6279,18 +7987,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-LABEL: 
global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB13_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: v_mov_b32_e32 v1, s2 ; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6300,9 +8008,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm @@ -6312,10 +8020,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-DPP-NEXT: 
; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6341,10 +8049,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -6368,17 +8076,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; 
GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -6387,8 +8095,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-DPP-NEXT: .LBB13_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -6396,13 +8104,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -6428,18 +8136,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: 
s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6449,9 +8157,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-DPP-NEXT: .LBB13_3: ; GFX1132-DPP-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index 3f4779f08e42f..65d0b9eafdf82 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -21,10 +21,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: 
s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -54,10 +54,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -83,10 +83,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB0_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -98,10 +98,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB0_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -112,13 +112,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -131,12 +131,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1] @@ -150,10 +150,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; 
GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -179,10 +179,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -194,10 +194,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -208,13 +208,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -227,12 +227,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1] @@ -976,10 +976,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -1009,10 +1009,10 @@ define amdgpu_kernel void 
@global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1038,10 +1038,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -1065,17 +1065,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 @@ -1084,8 +1084,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-NEXT: .LBB2_3: ; GFX1032-NEXT: s_endpgm @@ -1093,13 +1093,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB2_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1125,18 +1125,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; 
GFX1132-NEXT: s_cbranch_execz .LBB2_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: v_mov_b32_e32 v1, s2 ; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1146,9 +1146,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1132-NEXT: .LBB2_3: ; GFX1132-NEXT: s_endpgm @@ -1158,10 +1158,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1187,10 +1187,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: 
v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -1214,17 +1214,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -1233,8 +1233,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: 
s_or_b32 s4, vcc_lo, s4 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-DPP-NEXT: .LBB2_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -1242,13 +1242,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1274,18 +1274,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 +; 
GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1295,9 +1295,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1132-DPP-NEXT: .LBB2_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -2158,10 +2158,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -2191,10 +2191,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 
s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2220,10 +2220,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -2247,17 +2247,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 @@ -2266,8 +2266,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: 
s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-NEXT: .LBB4_3: ; GFX1032-NEXT: s_endpgm @@ -2275,13 +2275,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB4_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2307,18 +2307,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB4_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: v_mov_b32_e32 v1, s2 ; GFX1132-NEXT: .LBB4_2: ; 
%atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2328,9 +2328,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm @@ -2340,10 +2340,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2369,10 +2369,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: 
s_load_dword s2, s[0:1], 0x0 @@ -2396,17 +2396,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -2415,8 +2415,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-DPP-NEXT: .LBB4_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -2424,13 +2424,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 
s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2456,18 +2456,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2477,9 +2477,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; 
GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-DPP-NEXT: .LBB4_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -3335,739 +3335,1653 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB6_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: s_mov_b32 s3, 
0xf000 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-NEXT: v_or_b32_e32 v40, v3, v2 ; GFX7LESS-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) 
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2 ; GFX7LESS-NEXT: .LBB6_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 
+; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-NEXT: s_cbranch_execnz .LBB6_2 ; GFX9-NEXT: .LBB6_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB6_2 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB6_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 
v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0x40100000 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-NEXT: s_add_u32 s8, s36, 44 +; GFX1064-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 
offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1] -; GFX1064-NEXT: .LBB6_2: +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1064-NEXT: .LBB6_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB6_2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0x40100000 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; 
GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-NEXT: s_add_u32 s8, s36, 44 +; GFX1032-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1] -; GFX1032-NEXT: .LBB6_2: +; GFX1032-NEXT: s_swappc_b64 s[30:31], 
s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1032-NEXT: .LBB6_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164-NEXT: v_mov_b32_e32 v3, s3 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-NEXT: s_add_u32 s8, s36, 44 +; GFX1164-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: 
s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1164-NEXT: .LBB6_3: +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB6_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-NEXT: v_dual_mov_b32 v3, v1 
:: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-NEXT: s_add_u32 s8, s36, 44 +; GFX1132-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1132-NEXT: .LBB6_3: +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt 
vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-DPP-NEXT: 
s_cbranch_execnz .LBB6_2 ; GFX9-DPP-NEXT: .LBB6_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 +; 
GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1] -; GFX1064-DPP-NEXT: .LBB6_2: +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; 
GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1064-DPP-NEXT: .LBB6_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-DPP-NEXT: 
v_mov_b32_e32 v1, s0 +; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1] -; GFX1032-DPP-NEXT: .LBB6_2: +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1032-DPP-NEXT: .LBB6_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: 
v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1164-DPP-NEXT: .LBB6_3: +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], 
v[2:3], v[2:3] -; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; 
GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1132-DPP-NEXT: .LBB6_3: +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 8 + %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4 ret void } define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; 
GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_min_f64 v[0:1], 
v[0:1], v[4:5] +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v2, 
off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-NEXT: 
s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: 
buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-NEXT: s_cbranch_execnz .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s38, -1 -; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: 
s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 -; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; 
GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, 
s[44:45] +; GFX1064-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s38, -1 -; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 -; 
GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_mov_b32_e32 v41, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 
s[10:11], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 
v31, v0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 -; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: v_mov_b32_e32 v41, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: 
v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-NEXT: s_cbranch_execnz .LBB7_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; 
GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 -; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-NEXT: s_cbranch_execnz .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s38, -1 -; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 -; 
GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; 
GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 
s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; 
GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35] +; GFX1064-DPP-NEXT: 
s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v3, off, 
s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; 
GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: 
s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, 
div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 
32 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; 
GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() - %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8 + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 4 ret void } @@ -4077,10 +4991,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB8_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 
s[4:5], 0 @@ -4114,10 +5028,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB8_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -4145,10 +5059,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB8_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -4174,18 +5088,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB8_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: v_mov_b32_e32 v3, s3 ; GFX1032-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -4195,8 +5109,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1032-NEXT: .LBB8_3: ; GFX1032-NEXT: s_endpgm @@ -4204,13 +5118,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB8_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -4238,18 +5152,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB8_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX1132-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4259,9 +5173,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1132-NEXT: .LBB8_3: ; GFX1132-NEXT: s_endpgm @@ -4271,10 +5185,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; 
GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4302,10 +5216,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -4331,18 +5245,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -4352,8 +5266,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1032-DPP-NEXT: .LBB8_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -4361,13 +5275,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -4395,18 +5309,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; 
GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4416,9 +5330,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1132-DPP-NEXT: .LBB8_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -4933,859 +5847,1653 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, 
SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB10_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-NEXT: v_or_b32_e32 v40, v3, v2 ; GFX7LESS-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc ; 
GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 
s[44:45], vcc, s[44:45] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2 ; GFX7LESS-NEXT: .LBB10_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: 
v_min_f64 v[0:1], v[0:1], 4.0 -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-NEXT: .LBB10_3: ; 
GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v2, s2 -; GFX1064-NEXT: v_mov_b32_e32 v3, s3 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 ; GFX1064-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-NEXT: v_min_f64 v[0:1], 
v[0:1], 4.0 -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-NEXT: s_add_u32 s8, s36, 44 +; GFX1064-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 
s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX1064-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-NEXT: .LBB10_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 ; GFX1032-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] 
-; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-NEXT: s_add_u32 s8, s36, 44 +; GFX1032-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 ; GFX1032-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-NEXT: .LBB10_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164-NEXT: v_mov_b32_e32 v3, s3 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; 
GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-NEXT: s_add_u32 s8, s36, 44 +; GFX1164-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-NEXT: .LBB10_3: +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: 
global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: 
v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-NEXT: s_add_u32 s8, s36, 44 +; GFX1132-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-NEXT: .LBB10_3: +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_mov_b32 s48, 
SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: 
s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-DPP-NEXT: 
v_min_f64 v[0:1], v[0:1], 4.0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; 
GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: -; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: 
s_load_dwordx2 s[0:1], s[42:43], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 
4.0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 ; 
GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; 
GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-DPP-NEXT: .LBB10_3: +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 
s[4:5], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; 
GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-DPP-NEXT: .LBB10_3: +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 monotonic, align 8 + %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 monotonic, align 4 ret void } define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; 
GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dwordx2 
v[2:3], off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] 
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; 
GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 
v[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-NEXT: s_cbranch_execnz .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: 
s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s38, -1 -; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 -; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: 
s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 ; GFX1064-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: 
v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX1064-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s38, -1 -; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; 
GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 -; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_mov_b32_e32 v41, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; 
GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-NEXT: s_mov_b32 s44, 0 ; GFX1032-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; 
GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 ; GFX1032-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 -; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; 
GFX1164-NEXT: v_mov_b32_e32 v41, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-NEXT: s_clause 0x1 +; 
GFX1164-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 -; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: 
v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s38, -1 -; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: 
s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: s_swappc_b64 
s[30:31], s[6:7] +; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; 
GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: 
s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX1064-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-DPP-NEXT: v_min_f64 
v[0:1], v[0:1], v[4:5] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 
offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; 
GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; 
GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, 
s[34:35] -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-DPP-NEXT: s_clause 0x1 +; 
GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; 
GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; 
GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() - %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue monotonic, align 8 + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue monotonic, align 4 ret void } @@ -5795,10 +7503,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: 
v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB12_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -5828,10 +7536,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB12_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5857,10 +7565,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB12_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -5884,17 +7592,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032: ; %bb.0: ; 
GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB12_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 @@ -5903,8 +7611,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_execnz .LBB12_2 ; GFX1032-NEXT: .LBB12_3: ; GFX1032-NEXT: s_endpgm @@ -5912,13 +7620,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB12_3 ; GFX1164-NEXT: ; %bb.1: -; 
GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -5944,18 +7652,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB12_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: v_mov_b32_e32 v1, s2 ; GFX1132-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5965,9 +7673,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: s_cbranch_execnz .LBB12_2 ; GFX1132-NEXT: .LBB12_3: ; GFX1132-NEXT: s_endpgm @@ -5977,10 +7685,10 @@ define amdgpu_kernel 
void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6006,10 +7714,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -6033,17 +7741,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; 
GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -6052,8 +7760,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2 ; GFX1032-DPP-NEXT: .LBB12_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -6061,13 +7769,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -6093,18 +7801,18 
@@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6114,9 +7822,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2 ; GFX1132-DPP-NEXT: .LBB12_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -6130,10 +7838,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 
v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -6163,10 +7871,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -6192,10 +7900,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -6219,17 +7927,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032: ; %bb.0: ; 
GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 @@ -6238,8 +7946,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-NEXT: .LBB13_3: ; GFX1032-NEXT: s_endpgm @@ -6247,13 +7955,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB13_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 
s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -6279,18 +7987,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB13_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: v_mov_b32_e32 v1, s2 ; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6300,9 +8008,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm @@ -6312,10 +8020,10 @@ define amdgpu_kernel void 
@global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6341,10 +8049,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -6368,17 +8076,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -6387,8 +8095,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-DPP-NEXT: .LBB13_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -6396,13 +8104,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -6428,18 +8136,18 @@ define amdgpu_kernel void 
@global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6449,9 +8157,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-DPP-NEXT: .LBB13_3: ; GFX1132-DPP-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index 64650e2733a00..2bba8d4f43b1a 100644 --- 
a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -18,15 +18,15 @@ declare double @div.double.value() define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -54,23 +54,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX9-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -86,23 +86,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB0_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -118,22 +118,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB0_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s5 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -141,33 +141,33 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 ; GFX1032-NEXT: .LBB0_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: 
s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB0_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[4:5] ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -185,23 +185,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: s_mov_b32 s5, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB0_3 ; GFX1132-NEXT: 
; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s5 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -210,32 +210,32 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: s_cbranch_execnz .LBB0_2 ; GFX1132-NEXT: .LBB0_3: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] -; 
GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -251,23 +251,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; 
GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -283,22 +283,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -306,33 +306,33 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; 
GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 ; GFX1032-DPP-NEXT: .LBB0_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -350,23 +350,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; 
GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s5, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -375,9 +375,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2 ; GFX1132-DPP-NEXT: .LBB0_3: ; 
GFX1132-DPP-NEXT: s_endpgm @@ -1158,24 +1158,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ret void } -define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 { -; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp: +define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s10, -1 -; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 -; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -1205,27 +1205,27 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; 
GFX7LESS-NEXT: .LBB2_3: ; GFX7LESS-NEXT: s_endpgm ; -; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s3, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1247,33 +1247,33 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX9-NEXT: .LBB2_3: ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064: ; %bb.0: -; 
GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s14, -1 +; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -1287,31 +1287,31 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: .LBB2_3: ; 
GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s8, s8, s3 -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_addc_u32 s9, s9, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s14, -1 +; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s12, s12, s9 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_addc_u32 s13, s13, 0 +; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 -; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: .LBB2_2: ; 
%atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1320,19 +1320,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-NEXT: .LBB2_3: ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -1344,7 +1344,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1369,14 +1369,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: .LBB2_3: ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: 
; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -1386,14 +1386,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1402,34 +1402,34 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1132-NEXT: .LBB2_3: ; GFX1132-NEXT: s_endpgm ; -; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s10, -1 -; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s14, -1 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1451,33 +1451,33 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: .LBB2_3: 
; GFX9-DPP-NEXT: s_endpgm ; -; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; 
GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -1491,31 +1491,31 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: .LBB2_3: ; GFX1064-DPP-NEXT: s_endpgm ; -; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 -; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; 
GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1524,19 +1524,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-DPP-NEXT: .LBB2_3: ; GFX1032-DPP-NEXT: s_endpgm ; -; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -1548,7 +1548,7 @@ define amdgpu_kernel void 
@global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1573,14 +1573,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: .LBB2_3: ; GFX1164-DPP-NEXT: s_endpgm ; -; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -1590,14 +1590,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1606,9 +1606,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1132-DPP-NEXT: .LBB2_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -1617,8 +1617,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope } -define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 { -; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp: +define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 ; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 @@ -1668,7 +1668,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; -; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX9-LABEL: 
global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -1734,7 +1734,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-NEXT: .LBB3_5: ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -1800,7 +1800,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-NEXT: .LBB3_5: ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -1865,7 +1865,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-NEXT: .LBB3_5: ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 @@ -1926,7 +1926,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-NEXT: .LBB3_5: ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 @@ -1986,7 +1986,7 @@ define amdgpu_kernel void 
@global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1132-NEXT: .LBB3_5: ; GFX1132-NEXT: s_endpgm ; -; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -2072,7 +2072,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: .LBB3_3: ; GFX9-DPP-NEXT: s_endpgm ; -; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1064-DPP: ; %bb.0: ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -2154,7 +2154,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: .LBB3_3: ; GFX1064-DPP-NEXT: s_endpgm ; -; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -2230,7 +2230,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: .LBB3_3: ; GFX1032-DPP-NEXT: s_endpgm ; -; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 @@ -2312,7 +2312,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: .LBB3_3: ; GFX1164-DPP-NEXT: s_endpgm ; -; 
GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 @@ -2393,21 +2393,21 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr) #2{ ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s10, -1 -; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 -; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -2439,25 +2439,25 @@ define amdgpu_kernel void 
@global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s3, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2481,31 +2481,31 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-NEXT: 
s_mov_b64 s[2:3], exec -; GFX1064-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s14, -1 +; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -2521,29 +2521,29 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: s_mov_b32 
s11, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s8, s8, s3 -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_addc_u32 s9, s9, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s14, -1 +; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s12, s12, s9 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_addc_u32 s13, s13, 0 +; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 -; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2552,19 +2552,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; 
GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-NEXT: .LBB4_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -2576,7 +2576,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2603,12 +2603,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ 
-2618,14 +2618,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2634,34 +2634,34 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s10, -1 -; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DPP-NEXT: 
s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s14, -1 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2685,31 +2685,31 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: 
s_mov_b32 s14, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -2725,29 +2725,29 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-DPP-NEXT: 
s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 -; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2756,19 +2756,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; 
GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-DPP-NEXT: .LBB4_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -2780,7 +2780,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2807,12 +2807,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 
0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -2822,14 +2822,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2838,9 +2838,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-DPP-NEXT: .LBB4_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -3623,8 +3623,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ } -define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 { -; GFX7LESS-LABEL: 
global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp: +define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 ; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 @@ -3674,7 +3674,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; -; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -3740,7 +3740,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: .LBB6_5: ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -3806,7 +3806,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB6_5: ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -3871,7 +3871,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: .LBB6_5: ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp: +; 
GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 @@ -3932,7 +3932,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: .LBB6_5: ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 @@ -3992,7 +3992,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: .LBB6_5: ; GFX1132-NEXT: s_endpgm ; -; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -4078,7 +4078,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: .LBB6_3: ; GFX9-DPP-NEXT: s_endpgm ; -; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1064-DPP: ; %bb.0: ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -4160,7 +4160,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: .LBB6_3: ; GFX1064-DPP-NEXT: s_endpgm ; -; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: 
s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -4236,7 +4236,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: .LBB6_3: ; GFX1032-DPP-NEXT: s_endpgm ; -; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 @@ -4318,7 +4318,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: .LBB6_3: ; GFX1164-DPP-NEXT: s_endpgm ; -; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 @@ -4399,21 +4399,21 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scope_strictfp(ptr addrspace(1) %ptr) #2 { ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s10, -1 -; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 -; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; 
GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB7_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -4445,25 +4445,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; ; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s3, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-NEXT: 
v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -4487,31 +4487,31 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s14, -1 +; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, 
s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -4527,29 +4527,29 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s8, s8, s3 -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_addc_u32 s9, s9, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s14, -1 +; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s12, s12, s9 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_addc_u32 s13, s13, 0 +; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 -; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4558,19 +4558,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1032-NEXT: .LBB7_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -4582,7 +4582,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -4609,12 +4609,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; ; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; 
GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -4624,14 +4624,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4640,34 +4640,34 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s4 ; GFX1132-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1132-NEXT: .LBB7_3: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s10, -1 -; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s14, -1 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4691,31 +4691,31 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: 
s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: 
v_sub_f32_e32 v0, v1, v2 @@ -4731,29 +4731,29 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 -; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: 
s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4762,19 +4762,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1032-DPP-NEXT: .LBB7_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -4786,7 +4786,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -4813,12 +4813,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; ; 
GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -4828,14 +4828,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4844,9 +4844,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 
v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1132-DPP-NEXT: .LBB7_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -5630,891 +5630,1708 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB9_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[0:1] +; 
GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 -; GFX7LESS-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[1:2], s2 +; GFX7LESS-NEXT: v_or_b32_e32 v4, v0, v4 +; GFX7LESS-NEXT: v_mul_f64 v[41:42], v[1:2], 4.0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-NEXT: v_or_b32_e32 v40, v4, v3 ; GFX7LESS-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 
s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2 ; GFX7LESS-NEXT: .LBB9_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 +; 
GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_add_f64 v[0:1], v[3:4], -v[41:42] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: 
s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-NEXT: s_cbranch_execnz .LBB9_2 ; GFX9-NEXT: .LBB9_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 
s48, s48, s9 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, s9, v3 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[8:9] +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX1064-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 +; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v2, s2 -; GFX1064-NEXT: v_mov_b32_e32 v3, s3 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 ; GFX1064-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_b64 
s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1064-NEXT: s_add_u32 s8, s36, 44 +; GFX1064-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX1064-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1064-NEXT: .LBB9_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; 
GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_mov_b32 s8, exec_lo +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s8 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX1032-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 +; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 ; GFX1032-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_add_f64 v[0:1], 
v[2:3], -v[4:5] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1032-NEXT: s_add_u32 s8, s36, 44 +; GFX1032-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: 
s_andn2_b32 exec_lo, exec_lo, s44 ; GFX1032-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1032-NEXT: .LBB9_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_mov_b64 s[8:9], exec +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[8:9] +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164-NEXT: v_mov_b32_e32 v3, s3 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; 
GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1164-NEXT: s_add_u32 s8, s36, 44 +; GFX1164-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1164-NEXT: .LBB9_3: +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: s_mov_b32 s6, exec_lo +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 -; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s6 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-NEXT: v_dual_mov_b32 v2, s1 
:: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1132-NEXT: s_add_u32 s8, s36, 44 +; GFX1132-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; 
GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1132-NEXT: .LBB9_3: +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[3:4], -v[41:42] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 
v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v4, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX9-DPP-NEXT: .LBB9_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s9, v3 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: 
v_cvt_f64_u32_e32 v[0:1], s2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[8:9] +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1064-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; 
GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1064-DPP-NEXT: .LBB9_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s8, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; 
GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; 
GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1032-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, 
vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1032-DPP-NEXT: .LBB9_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[8:9], exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[8:9] +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1164-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1164-DPP-NEXT: .LBB9_3: +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: s_mov_b32 s6, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s6 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] 
+; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1132-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], 
s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1132-DPP-NEXT: .LBB9_3: +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 8 + %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4 ret void } define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; 
GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 
s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 ; GFX7LESS-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[40:41] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-NEXT: 
v_mov_b32_e32 v8, v4 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; 
GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: v_mov_b32_e32 v43, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc 
-; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-NEXT: s_cbranch_execnz .LBB10_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1064: ; %bb.0: 
-; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s38, -1 -; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 -; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[0:1], 
s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mov_b32_e32 v43, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 ; GFX1064-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; 
GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX1064-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s38, -1 -; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 
+; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 -; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_mov_b32_e32 v43, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1032-NEXT: 
v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: s_mov_b32 s44, 0 ; GFX1032-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, 
s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 ; GFX1032-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 -; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: v_mov_b32_e32 v43, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: v_mov_b32_e32 v41, v1 +; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; 
GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: 
s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 -; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: 
s_mov_b32 s14, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s38, -1 -; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], 
s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-DPP-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; 
GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: 
s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 
v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX1064-DPP-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], 
-v[40:41] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; 
GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; 
GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, 
__atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: 
s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; 
GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; 
GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() - %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8 + %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 4 ret void } -define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 { -; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: +define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 ; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: 
v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB11_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[4:5] ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -6547,27 +7364,27 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: .LBB11_3: ; GFX7LESS-NEXT: s_endpgm ; -; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB11_3 ; GFX9-NEXT: ; %bb.1: ; 
GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s3, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -6590,25 +7407,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX9-NEXT: .LBB11_3: ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s14, -1 +; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB11_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_add_f64 
v[0:1], 0xc3300000, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -6631,32 +7448,32 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1064-NEXT: .LBB11_3: ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s8, s8, s3 -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_addc_u32 s9, s9, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s14, -1 +; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s12, s12, s9 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_addc_u32 s13, s13, 0 +; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 -; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s0 +; GFX1032-NEXT: s_mov_b32 s7, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: v_mov_b32_e32 v3, s3 ; GFX1032-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -6665,19 +7482,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1032-NEXT: .LBB11_3: ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -6689,7 +7506,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 
v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -6715,14 +7532,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1164-NEXT: .LBB11_3: ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -6732,14 +7549,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX1132-NEXT: .LBB11_2: ; 
%atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6748,34 +7565,34 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1132-NEXT: .LBB11_3: ; GFX1132-NEXT: s_endpgm ; -; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s10, -1 -; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s14, -1 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 
s0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6798,25 +7615,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: .LBB11_3: ; GFX9-DPP-NEXT: s_endpgm ; -; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 -; GFX1064-DPP-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[0:1] +; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -6839,32 +7656,32 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: .LBB11_3: ; GFX1064-DPP-NEXT: s_endpgm ; -; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 -; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-DPP-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s0 +; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -6873,19 +7690,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1032-DPP-NEXT: .LBB11_3: ; GFX1032-DPP-NEXT: s_endpgm ; -; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 
s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -6897,7 +7714,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -6923,14 +7740,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: .LBB11_3: ; GFX1164-DPP-NEXT: s_endpgm ; -; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: +; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -6940,14 +7757,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 
0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6956,17 +7773,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1132-DPP-NEXT: .LBB11_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic ret void } -define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 { -; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: +define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 ; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 @@ -7019,7 +7836,7 
@@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; -; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -7065,7 +7882,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -7111,7 +7928,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -7157,7 +7974,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b32 s14, s8 ; GFX1164-NEXT: s_add_u32 s8, s2, 44 @@ -7194,7 +8011,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_add_u32 s8, s2, 44 ; GFX1132-NEXT: s_addc_u32 s9, s3, 0 @@ -7229,7 +8046,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; -; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -7275,7 +8092,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; -; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1064-DPP: ; %bb.0: ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -7321,7 +8138,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; -; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -7367,7 +8184,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end 
; GFX1032-DPP-NEXT: s_endpgm ; -; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 @@ -7404,7 +8221,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; -; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: +; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 @@ -7450,17 +8267,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 ; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[4:5] ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ 
-7495,25 +8312,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s3, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -7538,23 +8355,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 
-; GFX1064-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s14, -1 +; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -7579,30 +8396,30 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s8, s8, s3 -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_addc_u32 s9, s9, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; 
GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s14, -1 +; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s12, s12, s9 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_addc_u32 s13, s13, 0 +; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 -; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s0 +; GFX1032-NEXT: s_mov_b32 s7, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: v_mov_b32_e32 v3, s3 ; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -7611,19 +8428,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-NEXT: .LBB13_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1164: ; 
%bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -7635,7 +8452,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -7663,12 +8480,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -7678,14 +8495,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -7694,34 +8511,34 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s10, -1 -; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s14, -1 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; 
GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -7746,23 +8563,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; 
GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[0:1] +; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -7787,30 +8604,30 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 -; 
GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s0 +; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -7819,19 +8636,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-DPP-NEXT: .LBB13_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 
s[0:1], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -7843,7 +8660,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -7871,12 +8688,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -7886,14 +8703,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-DPP-NEXT: 
s_load_b64 s[2:3], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -7902,9 +8719,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-DPP-NEXT: .LBB13_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -8390,8 +9207,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ret void } -define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 { -; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: +define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 ; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 @@ -8444,7 +9261,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; -; GFX9-LABEL: 
global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -8490,7 +9307,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -8536,7 +9353,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -8582,7 +9399,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b32 s14, s8 ; GFX1164-NEXT: s_add_u32 s8, s2, 44 @@ -8619,7 +9436,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1132-LABEL: 
global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_add_u32 s8, s2, 44 ; GFX1132-NEXT: s_addc_u32 s9, s3, 0 @@ -8654,7 +9471,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; -; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -8700,7 +9517,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; -; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1064-DPP: ; %bb.0: ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -8746,7 +9563,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; -; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -8792,7 +9609,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; -; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1164-DPP-LABEL: 
global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 @@ -8829,7 +9646,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; -; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: +; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 @@ -8870,947 +9687,1717 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp(ptr addrspace(1) %ptr) #2 { ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s14, -1 -; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 -; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX7LESS-NEXT: s_mov_b32 s1, 
0x43300000 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0xc3300000 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v5, exec_hi, v5 +; GFX7LESS-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] +; GFX7LESS-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB16_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] -; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] -; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-NEXT: v_or_b32_e32 v40, v3, v2 ; GFX7LESS-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: 
v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: 
v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2 ; GFX7LESS-NEXT: .LBB16_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX9-NEXT: v_mov_b32_e32 v4, 0xc3300000 +; GFX9-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB16_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s3, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 
20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], 
s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-NEXT: s_cbranch_execnz .LBB16_2 ; GFX9-NEXT: .LBB16_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] +; GFX1064-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX1064-NEXT: v_cmp_eq_u32_e32 
vcc, 0, v3 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB16_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v2, s2 -; GFX1064-NEXT: v_mov_b32_e32 v3, s3 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 ; GFX1064-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1064-NEXT: s_add_u32 s8, s36, 44 +; GFX1064-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; 
GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX1064-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1064-NEXT: .LBB16_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s8, s8, s3 -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_addc_u32 s9, s9, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: 
s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB16_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 -; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 ; GFX1032-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], 
v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1032-NEXT: s_add_u32 s8, s36, 44 +; GFX1032-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 ; GFX1032-NEXT: s_cbranch_execnz .LBB16_2 
; GFX1032-NEXT: .LBB16_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 -; GFX1164-NEXT: scratch_store_b32 off, v1, off -; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:20 +; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:16 +; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:16 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB16_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1164-NEXT: 
s_mov_b64 s[44:45], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164-NEXT: v_mov_b32_e32 v3, s3 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1164-NEXT: s_add_u32 s8, s36, 44 +; GFX1164-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: 
scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1164-NEXT: .LBB16_3: +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1132-NEXT: s_mov_b32 s44, 0 ; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 -; GFX1132-NEXT: scratch_store_b32 off, v1, off -; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB16_3 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:20 +; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:16 +; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:16 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; 
GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB16_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1132-NEXT: s_add_u32 s8, s36, 44 +; GFX1132-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: 
s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1132-NEXT: .LBB16_3: +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s10, -1 -; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: 
s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xc3300000 +; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 
s0 +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: 
buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX9-DPP-NEXT: .LBB16_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] +; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; 
GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1064-DPP-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1064-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, 
s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1064-DPP-NEXT: .LBB16_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 -; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; 
GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 -; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 +; 
GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1032-DPP-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1032-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 
s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1032-DPP-NEXT: .LBB16_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 -; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off -; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:20 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 +; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1164-DPP-NEXT: s_add_u32 s8, s36, 44 +; 
GFX1164-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1164-DPP-NEXT: .LBB16_3: +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: 
v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 -; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off -; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB16_3 -; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:20 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 +; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 ; GFX1132-DPP-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1132-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; 
GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1132-DPP-NEXT: .LBB16_3: +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 monotonic, align 8 + %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 monotonic, align 4 ret void } define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp(ptr addrspace(1) %ptr) #2 { ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: 
s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[42:43], 
0 ; GFX7LESS-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[40:41] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 
s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_load_dwordx2 
s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: v_mov_b32_e32 v43, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; 
GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-NEXT: s_cbranch_execnz .LBB17_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s38, -1 -; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: 
s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 -; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mov_b32_e32 v43, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: 
s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 ; GFX1064-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; 
GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX1064-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s38, -1 -; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: 
s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 -; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_mov_b32_e32 v43, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: s_mov_b32 s44, 0 ; GFX1032-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-NEXT: 
v_mov_b32_e32 v4, v2 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 ; GFX1032-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1164: 
; %bb.0: -; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 -; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: v_mov_b32_e32 v43, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: v_mov_b32_e32 v41, v1 +; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB17_1: ; 
%atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], 
vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 -; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: s_swappc_b64 
s[30:31], s[0:1] +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; 
GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s38, -1 -; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; 
GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-DPP-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX9-DPP-NEXT: s_add_u32 s8, 
s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 -; 
GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; 
GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX1064-DPP-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: 
s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: 
s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: 
s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: 
v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: 
s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, 
s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; 
GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 
+; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() strictfp - %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue monotonic, align 8 + %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue monotonic, align 4 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/global_smrd.ll b/llvm/test/CodeGen/AMDGPU/global_smrd.ll index 3b71e8ffefbf8..e41634402c0c2 100644 --- a/llvm/test/CodeGen/AMDGPU/global_smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_smrd.ll @@ -71,7 +71,7 @@ bb: ; uniform load dominated by no-alias store - scalarize ; CHECK-LABEL: @no_memdep_alias_arg -; CHECK: s_load_dwordx2 s[[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]], s[4:5], 0x0 +; CHECK: s_load_dwordx2 s[[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]], s[6:7], 0x0 ; CHECK: s_load_dword [[SVAL:s[0-9]+]], s[[[IN_LO]]:[[IN_HI]]], 0x0 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]] @@ -100,7 +100,7 @@ define amdgpu_kernel void @memdep(ptr addrspace(1) %in, [8 x i32], ptr addrspace ; CHECK: s_getpc_b64 [[GET_PC:s\[[0-9]+:[0-9]+\]]] ; CHECK-DAG: s_load_dwordx2 [[A_ADDR:s\[[0-9]+:[0-9]+\]]], [[GET_PC]], 0x0 ; CHECK-DAG: s_load_dwordx2 [[A_ADDR1:s\[[0-9]+:[0-9]+\]]], [[A_ADDR]], 0x0 -; CHECK-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0 +; CHECK-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[6:7], 0x0 ; CHECK-DAG: s_load_dword [[SVAL:s[0-9]+]], [[A_ADDR1]], 0x0 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]] diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index e2d55990473c0..3735c6349fbb3 100644 --- 
a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -8,8 +8,8 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { ; CI-LABEL: load_f16_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -19,8 +19,8 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { ; ; VI-LABEL: load_f16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -31,10 +31,10 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { ; GFX11-LABEL: load_f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -46,8 +46,8 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg) #0 { ; CI-LABEL: load_v2f16_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: 
v_mov_b32_e32 v1, s1 @@ -57,8 +57,8 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg ; ; VI-LABEL: load_v2f16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -69,10 +69,10 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg ; GFX11-LABEL: load_v2f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -84,7 +84,7 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 { ; CIVI-LABEL: load_v3f16_arg: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_add_u32 s4, s0, 4 ; CIVI-NEXT: s_addc_u32 s5, s1, 0 @@ -100,7 +100,7 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg ; ; GFX11-LABEL: load_v3f16_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_e32 v2, s2 @@ -119,7 +119,7 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg define amdgpu_kernel 
void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 { ; CIVI-LABEL: load_v4f16_arg: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v2, s2 @@ -130,7 +130,7 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg ; ; GFX11-LABEL: load_v4f16_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -145,12 +145,12 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg) #0 { ; CI-LABEL: load_v8f16_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v4, s6 +; CI-NEXT: v_mov_b32_e32 v4, s4 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v5, s7 +; CI-NEXT: v_mov_b32_e32 v5, s5 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 @@ -159,12 +159,12 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg ; ; VI-LABEL: load_v8f16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 
; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -174,8 +174,8 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg ; GFX11-LABEL: load_v8f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 @@ -191,8 +191,8 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> %in) #0 { ; CI-LABEL: extload_v2f16_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s3, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -204,8 +204,8 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> % ; ; VI-LABEL: extload_v2f16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -218,13 +218,13 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> % ; GFX11-LABEL: extload_v2f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 -; GFX11-NEXT: 
v_cvt_f32_f16_e32 v1, s3 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -237,8 +237,8 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> % define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %arg) #0 { ; CI-LABEL: extload_f16_to_f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -248,8 +248,8 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a ; ; VI-LABEL: extload_f16_to_f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -260,11 +260,11 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a ; GFX11-LABEL: extload_f16_to_f32_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -277,8 +277,8 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a define amdgpu_kernel void 
@extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 x half> %arg) #0 { ; CI-LABEL: extload_v2f16_to_v2f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s3, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -290,8 +290,8 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 ; ; VI-LABEL: extload_v2f16_to_v2f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -304,13 +304,13 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 ; GFX11-LABEL: extload_v2f16_to_v2f32_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s3 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -323,7 +323,7 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 { ; CI-LABEL: extload_v3f16_to_v3f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt 
lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 @@ -336,7 +336,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 ; ; VI-LABEL: extload_v3f16_to_v3f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 @@ -349,7 +349,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 ; ; GFX11-LABEL: extload_v3f16_to_v3f32_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s4, s2, 16 @@ -368,7 +368,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 { ; CI-LABEL: extload_v4f16_to_v4f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s3, 16 ; CI-NEXT: s_lshr_b32 s5, s2, 16 @@ -383,7 +383,7 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; ; VI-LABEL: extload_v4f16_to_v4f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 16 ; VI-NEXT: s_lshr_b32 s5, s2, 16 @@ -398,7 +398,7 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; ; GFX11-LABEL: extload_v4f16_to_v4f32_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s4, s3, 16 @@ -419,8 +419,8 @@ define amdgpu_kernel void 
@extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 x half> %arg) #0 { ; CI-LABEL: extload_v8f16_to_v8f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s6, s1, 16 ; CI-NEXT: s_lshr_b32 s7, s0, 16 @@ -447,8 +447,8 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; ; VI-LABEL: extload_v8f16_to_v8f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s6, s1, 16 ; VI-NEXT: s_lshr_b32 s7, s0, 16 @@ -476,8 +476,8 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; GFX11-LABEL: extload_v8f16_to_v8f32_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s8, s7, 16 @@ -506,10 +506,10 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %arg) #0 { ; CI-LABEL: extload_f16_to_f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[4:5], 0x2 +; CI-NEXT: s_load_dword s0, s[6:7], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: 
v_mov_b32_e32 v3, s1 @@ -519,10 +519,10 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a ; ; VI-LABEL: extload_f16_to_f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[4:5], 0x8 +; VI-NEXT: s_load_dword s0, s[6:7], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -532,14 +532,14 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a ; ; GFX11-LABEL: extload_f16_to_f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -552,12 +552,12 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 x half> %arg) #0 { ; CI-LABEL: extload_v2f16_to_v2f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[4:5], 0x2 +; CI-NEXT: s_load_dword s0, s[6:7], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -568,12 +568,12 @@ define 
amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 ; ; VI-LABEL: extload_v2f16_to_v2f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[4:5], 0x8 +; VI-NEXT: s_load_dword s0, s[6:7], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s0, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s1 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -584,17 +584,17 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 ; ; GFX11-LABEL: extload_v2f16_to_v2f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s3 +; GFX11-NEXT: s_lshr_b32 s1, s0, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s1 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -607,7 +607,7 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 { ; CI-LABEL: extload_v3f16_to_v3f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; CI-NEXT: s_lshr_b32 s4, 
s2, 16 @@ -628,7 +628,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 ; ; VI-LABEL: extload_v3f16_to_v3f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 ; VI-NEXT: s_lshr_b32 s4, s2, 16 @@ -649,7 +649,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 ; ; GFX11-LABEL: extload_v3f16_to_v3f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s4, s2, 16 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s3 @@ -675,7 +675,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 { ; CI-LABEL: extload_v4f16_to_v4f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s3, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 @@ -700,7 +700,7 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 ; ; VI-LABEL: extload_v4f16_to_v4f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s5, s3, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s3 @@ -725,7 +725,7 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 ; ; GFX11-LABEL: extload_v4f16_to_v4f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s5, s3, 16 ; GFX11-NEXT: s_lshr_b32 s4, s2, 16 @@ -754,8 +754,8 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 define 
amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 x half> %arg) #0 { ; CI-LABEL: extload_v8f16_to_v8f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s6, s3, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s6 @@ -801,8 +801,8 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; ; VI-LABEL: extload_v8f16_to_v8f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s6, s0, 16 ; VI-NEXT: s_lshr_b32 s8, s2, 16 @@ -848,22 +848,20 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; ; GFX11-LABEL: extload_v8f16_to_v8f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s9, s7, 16 ; GFX11-NEXT: s_lshr_b32 s8, s6, 16 -; GFX11-NEXT: s_lshr_b32 s3, s5, 16 +; GFX11-NEXT: s_lshr_b32 s1, s5, 16 ; GFX11-NEXT: v_cvt_f32_f16_e32 v6, s7 ; GFX11-NEXT: v_cvt_f32_f16_e32 v11, s9 -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s0, s4, 16 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s6 ; GFX11-NEXT: v_cvt_f32_f16_e32 v10, s8 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s5 -; GFX11-NEXT: v_cvt_f32_f16_e32 v7, s3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v7, s1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GFX11-NEXT: v_cvt_f32_f16_e32 v16, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v16, s0 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v6 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v11 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], 
v3 @@ -872,7 +870,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v16 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v16, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48 ; GFX11-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32 @@ -889,7 +889,7 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_load_store_f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -902,7 +902,7 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: global_load_store_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -919,7 +919,7 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_load_store_v2f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -932,7 +932,7 @@ define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: global_load_store_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; 
GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -949,7 +949,7 @@ define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { ; CIVI-LABEL: global_load_store_v4f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 @@ -962,7 +962,7 @@ define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr add ; ; GFX11-LABEL: global_load_store_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -979,7 +979,7 @@ define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr add define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_load_store_v8f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -992,7 +992,7 @@ define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: global_load_store_v8f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -1009,7 +1009,7 @@ define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, 
ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_extload_f16_to_f32: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -1023,7 +1023,7 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_extload_f16_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -1042,7 +1042,7 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v2f16_to_v2f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1058,7 +1058,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v2f16_to_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1073,7 +1073,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v2f16_to_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v2, s[2:3] @@ -1095,7 +1095,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, define amdgpu_kernel void 
@global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v3f16_to_v3f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1112,7 +1112,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v3f16_to_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1128,7 +1128,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v3f16_to_v3f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v3, s[2:3] @@ -1151,7 +1151,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v4f16_to_v4f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1170,7 +1170,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v4f16_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1187,7 +1187,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: 
global_extload_v4f16_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[2:3] @@ -1212,7 +1212,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v8f16_to_v8f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1242,7 +1242,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v8f16_to_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1268,7 +1268,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v8f16_to_v8f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v12, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v12, s[2:3] @@ -1300,7 +1300,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v16f16_to_v16f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s4, s2, 16 ; CI-NEXT: s_addc_u32 s5, s3, 0 @@ -1358,7 +1358,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr 
addrspace(1) %out ; ; VI-LABEL: global_extload_v16f16_to_v16f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1408,7 +1408,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; ; GFX11-LABEL: global_extload_v16f16_to_v16f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v20, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1457,7 +1457,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_extload_f16_to_f64: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -1472,7 +1472,7 @@ define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_extload_f16_to_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v0, v2, s[2:3] @@ -1493,7 +1493,7 @@ define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v2f16_to_v2f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1511,7 +1511,7 @@ define amdgpu_kernel void 
@global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v2f16_to_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1528,7 +1528,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v2f16_to_v2f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v4, s[2:3] @@ -1553,7 +1553,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v3f16_to_v3f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1578,7 +1578,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v3f16_to_v3f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1602,7 +1602,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v3f16_to_v3f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] @@ -1631,7 +1631,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, define amdgpu_kernel void 
@global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v4f16_to_v4f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1659,7 +1659,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v4f16_to_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1685,7 +1685,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v4f16_to_v4f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] @@ -1718,7 +1718,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v8f16_to_v8f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1766,7 +1766,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v8f16_to_v8f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1810,7 +1810,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; ; GFX11-LABEL: 
global_extload_v8f16_to_v8f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v16, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v16, s[2:3] @@ -1852,7 +1852,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v16f16_to_v16f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1947,7 +1947,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; ; VI-LABEL: global_extload_v16f16_to_v16f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2033,7 +2033,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; ; GFX11-LABEL: global_extload_v16f16_to_v16f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2102,7 +2102,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_truncstore_f32_to_f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -2116,7 +2116,7 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr 
addrspace(1) %out, p ; ; GFX11-LABEL: global_truncstore_f32_to_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2135,7 +2135,7 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, p define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v2f32_to_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2152,7 +2152,7 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou ; ; VI-LABEL: global_truncstore_v2f32_to_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2168,7 +2168,7 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_truncstore_v2f32_to_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -2190,7 +2190,7 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v3f32_to_v3f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2213,7 +2213,7 @@ define 
amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou ; ; VI-LABEL: global_truncstore_v3f32_to_v3f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2235,7 +2235,7 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_truncstore_v3f32_to_v3f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b96 v[0:2], v3, s[2:3] @@ -2260,7 +2260,7 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v4f32_to_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2281,7 +2281,7 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou ; ; VI-LABEL: global_truncstore_v4f32_to_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2300,7 +2300,7 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_truncstore_v4f32_to_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -2325,7 +2325,7 @@ define amdgpu_kernel void 
@global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v8f32_to_v8f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2360,7 +2360,7 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou ; ; VI-LABEL: global_truncstore_v8f32_to_v8f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2391,7 +2391,7 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_truncstore_v8f32_to_v8f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2425,7 +2425,7 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v16f32_to_v16f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s4, s2, 32 ; CI-NEXT: s_addc_u32 s5, s3, 0 @@ -2494,7 +2494,7 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; ; VI-LABEL: global_truncstore_v16f32_to_v16f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 32 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -2554,7 +2554,7 @@ define 
amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; ; GFX11-LABEL: global_truncstore_v16f32_to_v16f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v16, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 @@ -2606,12 +2606,12 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 { ; CI-LABEL: fadd_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[4:5], 0x2 +; CI-NEXT: s_load_dword s0, s[6:7], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; CI-NEXT: s_lshr_b32 s0, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2622,8 +2622,8 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 ; ; VI-LABEL: fadd_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -2636,13 +2636,13 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 ; GFX11-LABEL: fadd_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_add_f16_e64 v1, s2, s3 +; GFX11-NEXT: 
v_add_f16_e64 v1, s4, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2655,7 +2655,7 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x half> %b) #0 { ; CI-LABEL: fadd_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 @@ -2676,7 +2676,7 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x ; ; VI-LABEL: fadd_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 16 ; VI-NEXT: s_lshr_b32 s5, s2, 16 @@ -2693,7 +2693,7 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x ; ; GFX11-LABEL: fadd_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, s2, s3 @@ -2709,7 +2709,7 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: fadd_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2746,7 +2746,7 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: fadd_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: 
v_mov_b32_e32 v1, s3 @@ -2765,7 +2765,7 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX11-LABEL: fadd_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -2787,8 +2787,8 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x half> %b) #0 { ; CI-LABEL: fadd_v8f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s2, s8, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 @@ -2845,8 +2845,8 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x ; ; VI-LABEL: fadd_v8f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s2, s15, 16 ; VI-NEXT: s_lshr_b32 s3, s11, 16 @@ -2888,8 +2888,8 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x ; GFX11-LABEL: fadd_v8f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, s7, s11 @@ -2908,7 +2908,7 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, 
ptr addrspace(1) %out) #0 { ; CIVI-LABEL: test_bitcast_from_half: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 @@ -2921,7 +2921,7 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr ; ; GFX11-LABEL: test_bitcast_from_half: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -2939,7 +2939,7 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: test_bitcast_to_half: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -2952,7 +2952,7 @@ define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_bitcast_to_half: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll index b6eff8846dc8c..380a8e911e499 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll @@ -98,4 +98,4 @@ bb: ret void } -attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" } +attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-workitem-id-x" 
"amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-heap-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-heap-v5.ll index 6a49eac134a67..10c5ffd0eb07e 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-heap-v5.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-heap-v5.ll @@ -1,5 +1,6 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-attributor -o %t.bc %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj < %t.bc | llvm-readelf --notes - | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %t.bc | FileCheck --check-prefix=CHECK %s declare void @function1() diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v4.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v4.ll index 6f4c8911efd33..677584caa8b2e 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v4.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v4.ll @@ -1,5 +1,6 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-attributor -o %t.bc %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj < %t.bc | llvm-readelf --notes - | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %t.bc | FileCheck --check-prefix=CHECK %s declare void @function1() diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v5.ll index 01f8fbfd76314..1a5a7698e2f96 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v5.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v5.ll @@ -1,5 +1,6 @@ 
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-attributor -o %t.bc %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj < %t.bc | llvm-readelf --notes - | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %t.bc | FileCheck --check-prefix=CHECK %s declare void @function1() diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll index 7a9f4ae8a20fa..8c017fa5ec263 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll @@ -15,7 +15,7 @@ ; CHECK: .max_flat_workgroup_size: 1024 ; CHECK: .name: test ; CHECK: .private_segment_fixed_size: 0 -; CHECK: .sgpr_count: 6 +; CHECK: .sgpr_count: 10 ; CHECK: .symbol: test.kd ; CHECK: .vgpr_count: {{3|6}} ; WAVE64: .wavefront_size: 64 @@ -23,7 +23,7 @@ define amdgpu_kernel void @test( ptr addrspace(1) %r, ptr addrspace(1) %a, - ptr addrspace(1) %b) { + ptr addrspace(1) %b) "amdgpu-no-implicitarg-ptr" { entry: %a.val = load half, ptr addrspace(1) %a %b.val = load half, ptr addrspace(1) %b @@ -47,10 +47,10 @@ entry: } ; CHECK: .name: num_spilled_sgprs -; GFX700: .sgpr_spill_count: 12 -; GFX803: .sgpr_spill_count: 12 -; GFX900: .sgpr_spill_count: 48 -; GFX1010: .sgpr_spill_count: 48 +; GFX700: .sgpr_spill_count: 10 +; GFX803: .sgpr_spill_count: 10 +; GFX900: .sgpr_spill_count: 62 +; GFX1010: .sgpr_spill_count: 60 ; CHECK: .symbol: num_spilled_sgprs.kd define amdgpu_kernel void @num_spilled_sgprs( ptr addrspace(1) %out0, ptr addrspace(1) %out1, [8 x i32], @@ -61,27 +61,37 @@ define amdgpu_kernel void @num_spilled_sgprs( ptr addrspace(1) %outa, ptr addrspace(1) %outb, [8 x i32], ptr addrspace(1) %outc, ptr addrspace(1) %outd, 
[8 x i32], ptr addrspace(1) %oute, ptr addrspace(1) %outf, [8 x i32], + ptr addrspace(1) %outg, ptr addrspace(1) %outh, [8 x i32], + ptr addrspace(1) %outi, ptr addrspace(1) %outj, [8 x i32], + ptr addrspace(1) %outk, ptr addrspace(1) %outl, [8 x i32], + ptr addrspace(1) %outm, ptr addrspace(1) %outn, [8 x i32], i32 %in0, i32 %in1, i32 %in2, i32 %in3, [8 x i32], i32 %in4, i32 %in5, i32 %in6, i32 %in7, [8 x i32], i32 %in8, i32 %in9, i32 %ina, i32 %inb, [8 x i32], - i32 %inc, i32 %ind, i32 %ine, i32 %inf) #0 { + i32 %inc, i32 %ind, i32 %ine, i32 %inf, i32 %ing, i32 %inh, + i32 %ini, i32 %inj, i32 %ink) #0 { entry: - store i32 %in0, ptr addrspace(1) %out0 - store i32 %in1, ptr addrspace(1) %out1 - store i32 %in2, ptr addrspace(1) %out2 - store i32 %in3, ptr addrspace(1) %out3 - store i32 %in4, ptr addrspace(1) %out4 - store i32 %in5, ptr addrspace(1) %out5 - store i32 %in6, ptr addrspace(1) %out6 - store i32 %in7, ptr addrspace(1) %out7 - store i32 %in8, ptr addrspace(1) %out8 - store i32 %in9, ptr addrspace(1) %out9 - store i32 %ina, ptr addrspace(1) %outa - store i32 %inb, ptr addrspace(1) %outb - store i32 %inc, ptr addrspace(1) %outc - store i32 %ind, ptr addrspace(1) %outd - store i32 %ine, ptr addrspace(1) %oute - store i32 %inf, ptr addrspace(1) %outf + store volatile i32 %in0, ptr addrspace(1) %out0 + store volatile i32 %in1, ptr addrspace(1) %out1 + store volatile i32 %in2, ptr addrspace(1) %out2 + store volatile i32 %in3, ptr addrspace(1) %out3 + store volatile i32 %in4, ptr addrspace(1) %out4 + store volatile i32 %in5, ptr addrspace(1) %out5 + store volatile i32 %in6, ptr addrspace(1) %out6 + store volatile i32 %in7, ptr addrspace(1) %out7 + store volatile i32 %in8, ptr addrspace(1) %out8 + store volatile i32 %in9, ptr addrspace(1) %out9 + store volatile i32 %ina, ptr addrspace(1) %outa + store volatile i32 %inb, ptr addrspace(1) %outb + store volatile i32 %inc, ptr addrspace(1) %outc + store volatile i32 %ind, ptr addrspace(1) %outd + store volatile i32 
%ine, ptr addrspace(1) %oute + store volatile i32 %inf, ptr addrspace(1) %outf + store volatile i32 %ing, ptr addrspace(1) %outg + store volatile i32 %inh, ptr addrspace(1) %outh + store volatile i32 %ini, ptr addrspace(1) %outi + store volatile i32 %inj, ptr addrspace(1) %outj + store volatile i32 %ink, ptr addrspace(1) %outk ret void } @@ -160,7 +170,7 @@ define amdgpu_kernel void @num_spilled_vgprs() #1 { ; CHECK-NEXT: - 1 ; CHECK-NEXT: - 1 -attributes #0 = { "amdgpu-num-sgpr"="14" } +attributes #0 = { "amdgpu-num-sgpr"="20" } attributes #1 = { "amdgpu-num-vgpr"="20" } attributes #2 = { "amdgpu-flat-work-group-size"="1,256" } diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-multigrid-sync-arg-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-multigrid-sync-arg-v5.ll index 689619227b8d7..0db5f01fc0ccc 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-multigrid-sync-arg-v5.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-multigrid-sync-arg-v5.ll @@ -1,5 +1,6 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-attributor %s -o %t.bc +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj < %t.bc | llvm-readelf --notes - | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %t.bc | FileCheck --check-prefix=CHECK %s declare void @function1() diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll index 9854977c2f308..6eece2c9bf416 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll @@ -1,10 +1,12 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa 
-mcpu=gfx803 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefixes=CHECK,GFX9 %s - -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefix=CHECK %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefixes=CHECK,GFX9 %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -passes=amdgpu-attributor -o %t.gfx7.bc %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=amdgpu-attributor -o %t.gfx8.bc %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-attributor -o %t.gfx9.bc %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj < %t.gfx7.bc | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj < %t.gfx8.bc | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj < %t.gfx9.bc | llvm-readelf --notes - | FileCheck --check-prefixes=CHECK,GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %t.gfx7.bc | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %t.gfx8.bc | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %t.gfx9.bc | FileCheck --check-prefixes=CHECK,GFX9 %s ; On gfx8, the queue ptr is required for this addrspacecast. 
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-queueptr-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-queueptr-v5.ll index cf26a427aec32..acf829c4d3c72 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-queueptr-v5.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-queueptr-v5.ll @@ -1,5 +1,6 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=amdgpu-attributor -o %t.bc %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj < %t.bc | llvm-readelf --notes - | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %t.bc | FileCheck --check-prefix=CHECK %s declare void @function1() diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll index 7986368e2a358..03242b69beb8c 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll @@ -31,14 +31,14 @@ ; GFX10: .sgpr_spill_count: 0 ; GFX10: .vgpr_count: 4 ; GFX10: .vgpr_spill_count: 0 -define amdgpu_kernel void @test1(ptr %x) { +define amdgpu_kernel void @test1(ptr %x) #1 { %1 = load volatile float, ptr %x %2 = call float @f(float %1) store volatile float %2, ptr %x ret void } -define internal float @f(float %arg0) #0 { +define internal float @f(float %arg0) #1 { %stack = alloca float, i32 4, align 4, addrspace(5) store volatile float 3.0, ptr addrspace(5) %stack %val = load volatile float, ptr addrspace(5) %stack @@ -135,6 +135,7 @@ define amdgpu_kernel void @test4() { } attributes #0 = { norecurse } +attributes #1 = { norecurse "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } !llvm.module.flags = !{!0} !0 = !{i32 1, 
!"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll index de484677bf5e6..487e62b6c3495 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa.ll @@ -106,7 +106,7 @@ ; HSA: .Lfunc_end0: ; HSA: .size simple, .Lfunc_end0-simple -define amdgpu_kernel void @simple(ptr addrspace(1) %out) { +define amdgpu_kernel void @simple(ptr addrspace(1) %out) #0 { entry: store i32 0, ptr addrspace(1) %out ret void @@ -114,11 +114,13 @@ entry: ; HSA-LABEL: {{^}}simple_no_kernargs: ; HSA: .amdhsa_user_sgpr_kernarg_segment_ptr 0 -define amdgpu_kernel void @simple_no_kernargs() { +define amdgpu_kernel void @simple_no_kernargs() #0 { entry: store volatile i32 0, ptr addrspace(1) undef ret void } +attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } + !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll index 7ee31bf4dce7c..78653d7e21ad8 100644 --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -6,9 +6,9 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GFX9-LABEL: udiv32_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 @@ -52,9 +52,10 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; ; GFX10-LABEL: 
udiv32_invariant_denom: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s7, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX10-NEXT: s_sub_i32 s2, 0, s6 @@ -100,8 +101,8 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: udiv32_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s6, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6 @@ -171,9 +172,9 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GFX9-LABEL: urem32_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 @@ -215,9 +216,10 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; ; GFX10-LABEL: urem32_invariant_denom: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s7, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX10-NEXT: s_sub_i32 s2, 0, s6 @@ -261,8 +263,8 @@ define amdgpu_kernel void 
@urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: urem32_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s6, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6 @@ -331,14 +333,14 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GFX9-LABEL: sdiv32_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s2, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX9-NEXT: s_sub_i32 s5, 0, s2 -; GFX9-NEXT: s_ashr_i32 s4, s4, 31 +; GFX9-NEXT: s_abs_i32 s4, s5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_ashr_i32 s3, s5, 31 +; GFX9-NEXT: s_sub_i32 s5, 0, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -349,70 +351,70 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: .LBB2_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_mul_hi_u32 s6, s3, s5 -; GFX9-NEXT: s_mul_i32 s7, s6, s2 -; GFX9-NEXT: s_sub_i32 s7, s3, s7 +; GFX9-NEXT: s_mul_hi_u32 s6, s2, s5 +; GFX9-NEXT: s_mul_i32 s7, s6, s4 +; GFX9-NEXT: s_sub_i32 s7, s2, s7 ; GFX9-NEXT: s_add_i32 s8, s6, 1 -; GFX9-NEXT: s_sub_i32 s9, s7, s2 -; GFX9-NEXT: s_cmp_ge_u32 s7, s2 +; GFX9-NEXT: s_sub_i32 s9, s7, s4 +; GFX9-NEXT: s_cmp_ge_u32 s7, s4 ; GFX9-NEXT: s_cselect_b32 s6, s8, s6 ; GFX9-NEXT: s_cselect_b32 s7, s9, s7 ; 
GFX9-NEXT: s_add_i32 s8, s6, 1 -; GFX9-NEXT: s_cmp_ge_u32 s7, s2 +; GFX9-NEXT: s_cmp_ge_u32 s7, s4 ; GFX9-NEXT: s_cselect_b32 s6, s8, s6 -; GFX9-NEXT: s_xor_b32 s6, s6, s4 -; GFX9-NEXT: s_sub_i32 s6, s6, s4 -; GFX9-NEXT: s_add_i32 s3, s3, 1 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_sub_i32 s6, s6, s3 +; GFX9-NEXT: s_add_i32 s2, s2, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400 +; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x400 ; GFX9-NEXT: s_cbranch_scc0 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdiv32_invariant_denom: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dword s3, s[0:1], 0x2c +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s5, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_abs_i32 s2, s3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX10-NEXT: s_sub_i32 s4, 0, s2 -; GFX10-NEXT: s_ashr_i32 s3, s3, 31 +; GFX10-NEXT: s_abs_i32 s4, s5 +; GFX10-NEXT: s_ashr_i32 s2, s5, 31 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX10-NEXT: s_sub_i32 s3, 0, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s5, v0 +; GFX10-NEXT: v_readfirstlane_b32 s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_mul_i32 s4, s4, s5 -; GFX10-NEXT: s_mul_hi_u32 s6, s5, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: s_add_i32 s5, s5, s6 +; GFX10-NEXT: s_mul_i32 s3, s3, s6 +; GFX10-NEXT: s_mul_hi_u32 s5, s6, s3 +; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: s_add_i32 s5, s6, s5 ; GFX10-NEXT: .LBB2_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_mul_hi_u32 s6, s4, s5 -; GFX10-NEXT: s_mul_i32 s7, s6, s2 +; GFX10-NEXT: s_mul_hi_u32 s6, s3, s5 +; 
GFX10-NEXT: s_mul_i32 s7, s6, s4 ; GFX10-NEXT: s_add_i32 s8, s6, 1 -; GFX10-NEXT: s_sub_i32 s7, s4, s7 -; GFX10-NEXT: s_sub_i32 s9, s7, s2 -; GFX10-NEXT: s_cmp_ge_u32 s7, s2 +; GFX10-NEXT: s_sub_i32 s7, s3, s7 +; GFX10-NEXT: s_sub_i32 s9, s7, s4 +; GFX10-NEXT: s_cmp_ge_u32 s7, s4 ; GFX10-NEXT: s_cselect_b32 s6, s8, s6 ; GFX10-NEXT: s_cselect_b32 s7, s9, s7 ; GFX10-NEXT: s_add_i32 s8, s6, 1 -; GFX10-NEXT: s_cmp_ge_u32 s7, s2 +; GFX10-NEXT: s_cmp_ge_u32 s7, s4 ; GFX10-NEXT: s_cselect_b32 s6, s8, s6 -; GFX10-NEXT: s_add_i32 s4, s4, 1 -; GFX10-NEXT: s_xor_b32 s6, s6, s3 -; GFX10-NEXT: s_sub_i32 s6, s6, s3 +; GFX10-NEXT: s_add_i32 s3, s3, 1 +; GFX10-NEXT: s_xor_b32 s6, s6, s2 +; GFX10-NEXT: s_sub_i32 s6, s6, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_add_u32 s0, s0, 4 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400 +; GFX10-NEXT: s_cmpk_eq_i32 s3, 0x400 ; GFX10-NEXT: s_cbranch_scc0 .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -420,51 +422,51 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: sdiv32_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_abs_i32 s2, s3 -; GFX11-NEXT: s_ashr_i32 s3, s3, 31 -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX11-NEXT: s_sub_i32 s4, 0, s2 +; GFX11-NEXT: s_abs_i32 s4, s5 +; GFX11-NEXT: s_ashr_i32 s2, s5, 31 +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX11-NEXT: s_sub_i32 s3, 0, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 
v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-NEXT: v_readfirstlane_b32 s6, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_mul_i32 s4, s4, s5 +; GFX11-NEXT: s_mul_i32 s3, s3, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_hi_u32 s6, s5, s4 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_add_i32 s5, s5, s6 +; GFX11-NEXT: s_mul_hi_u32 s5, s6, s3 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_add_i32 s5, s6, s5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB2_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_hi_u32 s6, s4, s5 -; GFX11-NEXT: s_mul_i32 s7, s6, s2 +; GFX11-NEXT: s_mul_hi_u32 s6, s3, s5 +; GFX11-NEXT: s_mul_i32 s7, s6, s4 ; GFX11-NEXT: s_add_i32 s8, s6, 1 -; GFX11-NEXT: s_sub_i32 s7, s4, s7 +; GFX11-NEXT: s_sub_i32 s7, s3, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s9, s7, s2 -; GFX11-NEXT: s_cmp_ge_u32 s7, s2 +; GFX11-NEXT: s_sub_i32 s9, s7, s4 +; GFX11-NEXT: s_cmp_ge_u32 s7, s4 ; GFX11-NEXT: s_cselect_b32 s6, s8, s6 ; GFX11-NEXT: s_cselect_b32 s7, s9, s7 ; GFX11-NEXT: s_add_i32 s8, s6, 1 -; GFX11-NEXT: s_cmp_ge_u32 s7, s2 +; GFX11-NEXT: s_cmp_ge_u32 s7, s4 ; GFX11-NEXT: s_cselect_b32 s6, s8, s6 -; GFX11-NEXT: s_add_i32 s4, s4, 1 -; GFX11-NEXT: s_xor_b32 s6, s6, s3 +; GFX11-NEXT: s_add_i32 s3, s3, 1 +; GFX11-NEXT: s_xor_b32 s6, s6, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s6, s6, s3 +; GFX11-NEXT: s_sub_i32 s6, s6, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s6 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, 4 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400 +; GFX11-NEXT: s_cmpk_eq_i32 s3, 0x400 ; 
GFX11-NEXT: s_cbranch_scc0 .LBB2_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_nop 0 @@ -490,125 +492,126 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GFX9-LABEL: srem32_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s2, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX9-NEXT: s_sub_i32 s4, 0, s2 +; GFX9-NEXT: s_abs_i32 s4, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_sub_i32 s3, 0, s4 +; GFX9-NEXT: s_mov_b32 s2, 0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s4, s5, s4 +; GFX9-NEXT: s_mul_i32 s3, s3, s5 +; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3 +; GFX9-NEXT: s_add_i32 s3, s5, s3 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: .LBB3_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_mul_hi_u32 s5, s3, s4 -; GFX9-NEXT: s_mul_i32 s5, s5, s2 -; GFX9-NEXT: s_sub_i32 s5, s3, s5 -; GFX9-NEXT: s_sub_i32 s6, s5, s2 -; GFX9-NEXT: s_cmp_ge_u32 s5, s2 +; GFX9-NEXT: s_mul_hi_u32 s5, s2, s3 +; GFX9-NEXT: s_mul_i32 s5, s5, s4 +; GFX9-NEXT: s_sub_i32 s5, s2, s5 +; GFX9-NEXT: s_sub_i32 s6, s5, s4 +; GFX9-NEXT: s_cmp_ge_u32 s5, s4 ; GFX9-NEXT: s_cselect_b32 s5, s6, s5 -; GFX9-NEXT: s_sub_i32 s6, s5, s2 -; GFX9-NEXT: s_cmp_ge_u32 s5, s2 +; GFX9-NEXT: s_sub_i32 s6, s5, s4 +; GFX9-NEXT: s_cmp_ge_u32 s5, s4 ; GFX9-NEXT: s_cselect_b32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s3, s3, 1 +; GFX9-NEXT: s_add_i32 s2, s2, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: 
global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400 +; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x400 ; GFX9-NEXT: s_cbranch_scc0 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: srem32_invariant_denom: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_abs_i32 s2, s2 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX10-NEXT: s_sub_i32 s3, 0, s2 +; GFX10-NEXT: s_abs_i32 s4, s0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX10-NEXT: s_sub_i32 s2, 0, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-NEXT: v_readfirstlane_b32 s3, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_mul_i32 s3, s3, s4 -; GFX10-NEXT: s_mul_hi_u32 s5, s4, s3 -; GFX10-NEXT: s_mov_b32 s3, 0 -; GFX10-NEXT: s_add_i32 s4, s4, s5 +; GFX10-NEXT: s_mul_i32 s2, s2, s3 +; GFX10-NEXT: s_mul_hi_u32 s5, s3, s2 +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: s_add_i32 s3, s3, s5 ; GFX10-NEXT: .LBB3_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_mul_hi_u32 s5, s3, s4 -; GFX10-NEXT: s_mul_i32 s5, s5, s2 -; GFX10-NEXT: s_sub_i32 s5, s3, s5 -; GFX10-NEXT: s_sub_i32 s6, s5, s2 -; GFX10-NEXT: s_cmp_ge_u32 s5, s2 +; GFX10-NEXT: s_mul_hi_u32 s5, s2, s3 +; GFX10-NEXT: s_mul_i32 s5, s5, s4 +; GFX10-NEXT: s_sub_i32 s5, s2, s5 +; GFX10-NEXT: s_sub_i32 s6, s5, s4 +; GFX10-NEXT: s_cmp_ge_u32 s5, s4 ; GFX10-NEXT: s_cselect_b32 s5, s6, s5 -; GFX10-NEXT: s_sub_i32 s6, s5, s2 -; GFX10-NEXT: s_cmp_ge_u32 s5, s2 +; GFX10-NEXT: s_sub_i32 s6, s5, s4 +; GFX10-NEXT: s_cmp_ge_u32 s5, s4 ; GFX10-NEXT: s_cselect_b32 s5, s6, s5 -; GFX10-NEXT: s_add_i32 s3, s3, 1 
+; GFX10-NEXT: s_add_i32 s2, s2, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_add_u32 s0, s0, 4 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_cmpk_eq_i32 s3, 0x400 +; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x400 ; GFX10-NEXT: s_cbranch_scc0 .LBB3_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: srem32_invariant_denom: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_abs_i32 s2, s2 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX11-NEXT: s_sub_i32 s3, 0, s2 +; GFX11-NEXT: s_abs_i32 s4, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX11-NEXT: s_sub_i32 s2, 0, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s3, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s3, s3, s4 -; GFX11-NEXT: s_mul_hi_u32 s5, s4, s3 -; GFX11-NEXT: s_mov_b32 s3, 0 -; GFX11-NEXT: s_add_i32 s4, s4, s5 +; GFX11-NEXT: s_mul_i32 s2, s2, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_hi_u32 s5, s3, s2 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_add_i32 s3, 
s3, s5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB3_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_hi_u32 s5, s3, s4 -; GFX11-NEXT: s_mul_i32 s5, s5, s2 +; GFX11-NEXT: s_mul_hi_u32 s5, s2, s3 +; GFX11-NEXT: s_mul_i32 s5, s5, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s5, s3, s5 -; GFX11-NEXT: s_sub_i32 s6, s5, s2 -; GFX11-NEXT: s_cmp_ge_u32 s5, s2 +; GFX11-NEXT: s_sub_i32 s5, s2, s5 +; GFX11-NEXT: s_sub_i32 s6, s5, s4 +; GFX11-NEXT: s_cmp_ge_u32 s5, s4 ; GFX11-NEXT: s_cselect_b32 s5, s6, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s6, s5, s2 -; GFX11-NEXT: s_cmp_ge_u32 s5, s2 +; GFX11-NEXT: s_sub_i32 s6, s5, s4 +; GFX11-NEXT: s_cmp_ge_u32 s5, s4 ; GFX11-NEXT: s_cselect_b32 s5, s6, s5 -; GFX11-NEXT: s_add_i32 s3, s3, 1 +; GFX11-NEXT: s_add_i32 s2, s2, 1 ; GFX11-NEXT: v_mov_b32_e32 v1, s5 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, 4 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_cmpk_eq_i32 s3, 0x400 +; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x400 ; GFX11-NEXT: s_cbranch_scc0 .LBB3_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_nop 0 @@ -634,14 +637,14 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) { ; GFX9-LABEL: udiv16_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX9-NEXT: s_movk_i32 s4, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v3, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s2, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; 
GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB4_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -655,7 +658,6 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_mad_f32 v4, -v6, v0, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, v0 ; GFX9-NEXT: v_addc_co_u32_e64 v4, s[0:1], 0, v7, s[0:1] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v5, v4, s[2:3] ; GFX9-NEXT: s_cbranch_vccz .LBB4_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 @@ -664,12 +666,12 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-LABEL: udiv16_invariant_denom: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s0, s4, 0xffff +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB4_1: ; %bb3 @@ -685,7 +687,7 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX10-NEXT: v_cmp_ge_f32_e64 s0, |v4|, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, 0, v6, s0 -; GFX10-NEXT: global_store_short v5, v4, s[2:3] +; GFX10-NEXT: global_store_short v5, v4, s[4:5] ; GFX10-NEXT: s_cbranch_vccz .LBB4_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -693,11 +695,11 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: udiv16_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 
s[2:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s0, s4, 0xffff +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 @@ -746,14 +748,14 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) { ; GFX9-LABEL: urem16_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_movk_i32 s3, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v3, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s2, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_and_b32 s4, s0, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_movk_i32 s2, 0x400 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB5_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -765,10 +767,11 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s2 +; GFX9-NEXT: v_mul_lo_u32 v4, v4, s4 ; GFX9-NEXT: v_sub_u32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v2, 1, v2 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v2 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s2, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v5, v4, s[0:1] ; GFX9-NEXT: s_cbranch_vccz .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 @@ -777,13 +780,13 @@ define amdgpu_kernel void 
@urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-LABEL: urem16_invariant_denom: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s0, s4, 0xffff -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX10-NEXT: s_and_b32 s2, s4, 0xffff +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB5_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -795,10 +798,10 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v4|, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_mul_lo_u32 v4, v4, s0 +; GFX10-NEXT: v_mul_lo_u32 v4, v4, s2 ; GFX10-NEXT: v_sub_nc_u32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: v_add_nc_u16 v2, v2, 1 -; GFX10-NEXT: global_store_short v5, v4, s[2:3] +; GFX10-NEXT: global_store_short v5, v4, s[0:1] ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 ; GFX10-NEXT: s_cbranch_vccz .LBB5_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 @@ -807,11 +810,11 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: urem16_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_and_b32 s2, s4, 
0xffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 @@ -862,19 +865,19 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) { ; GFX9-LABEL: sdiv16_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_movk_i32 s3, 0x400 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_movk_i32 s2, 0x400 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB6_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_sext_i32_i16 s5, s4 +; GFX9-NEXT: s_sext_i32_i16 s5, s3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5 -; GFX9-NEXT: s_xor_b32 s6, s5, s2 +; GFX9-NEXT: s_xor_b32 s6, s5, s4 ; GFX9-NEXT: s_ashr_i32 s5, s6, 30 ; GFX9-NEXT: s_or_b32 s5, s5, 1 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v1 @@ -883,14 +886,15 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v3|, |v0| ; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec -; GFX9-NEXT: v_add_u16_e64 v2, s4, 1 +; GFX9-NEXT: v_add_u16_e64 v2, s3, 1 ; GFX9-NEXT: s_cselect_b32 s5, s5, 0 -; GFX9-NEXT: s_and_b32 s6, 0xffff, s4 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v2 -; GFX9-NEXT: v_readfirstlane_b32 s4, v2 +; GFX9-NEXT: s_and_b32 s6, 0xffff, s3 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v2 ; GFX9-NEXT: v_add_u32_e32 v2, s5, v4 ; GFX9-NEXT: s_lshl_b32 s5, s6, 1 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v2, s[0:1] ; GFX9-NEXT: s_cbranch_vccz .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 @@ -899,19 +903,19 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-LABEL: sdiv16_invariant_denom: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_mov_b32 s3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sext_i32_i16 s0, s4 -; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX10-NEXT: s_sext_i32_i16 s2, s4 +; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB6_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_sext_i32_i16 s4, s1 -; GFX10-NEXT: v_add_nc_u16 v2, s1, 1 +; GFX10-NEXT: s_sext_i32_i16 s4, s3 +; GFX10-NEXT: v_add_nc_u16 v2, s3, 1 ; GFX10-NEXT: v_cvt_f32_i32_e32 v3, s4 -; GFX10-NEXT: s_xor_b32 s5, s4, s0 +; GFX10-NEXT: s_xor_b32 s5, s4, s2 ; GFX10-NEXT: s_ashr_i32 s4, s5, 30 ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 ; GFX10-NEXT: v_mul_f32_e32 v4, v3, v1 @@ -922,12 +926,12 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: v_cmp_ge_f32_e64 s5, |v3|, |v0| ; GFX10-NEXT: s_and_b32 s5, s5, exec_lo ; GFX10-NEXT: s_cselect_b32 s4, s4, 0 -; GFX10-NEXT: s_and_b32 s5, 0xffff, s1 -; GFX10-NEXT: v_readfirstlane_b32 s1, v2 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s3 +; GFX10-NEXT: v_readfirstlane_b32 s3, v2 ; GFX10-NEXT: s_lshl_b32 s5, s5, 1 ; GFX10-NEXT: v_add_nc_u32_e32 v2, s4, v4 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: global_store_short v3, v2, s[2:3] +; GFX10-NEXT: global_store_short v3, v2, s[0:1] ; GFX10-NEXT: s_cbranch_vccz .LBB6_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -935,11 +939,11 @@ define 
amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: sdiv16_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s2, s2 +; GFX11-NEXT: s_sext_i32_i16 s2, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 @@ -995,19 +999,19 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) { ; GFX9-LABEL: srem16_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_movk_i32 s3, 0x400 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_sext_i32_i16 s4, s0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_movk_i32 s2, 0x400 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB7_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_sext_i32_i16 s5, s4 +; GFX9-NEXT: s_sext_i32_i16 s5, s3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5 -; GFX9-NEXT: s_xor_b32 s6, s5, s2 +; GFX9-NEXT: s_xor_b32 s6, s5, s4 ; GFX9-NEXT: s_ashr_i32 s6, s6, 30 ; GFX9-NEXT: s_or_b32 s8, s6, 1 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v1 @@ -1016,16 +1020,18 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v3|, |v0| ; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec -; 
GFX9-NEXT: v_add_u16_e64 v2, s4, 1 +; GFX9-NEXT: v_add_u16_e64 v2, s3, 1 ; GFX9-NEXT: s_cselect_b32 s6, s8, 0 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v2 -; GFX9-NEXT: s_and_b32 s7, 0xffff, s4 -; GFX9-NEXT: v_readfirstlane_b32 s4, v2 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s2, v2 +; GFX9-NEXT: s_and_b32 s7, 0xffff, s3 +; GFX9-NEXT: v_readfirstlane_b32 s3, v2 ; GFX9-NEXT: v_add_u32_e32 v2, s6, v4 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4 ; GFX9-NEXT: s_lshl_b32 s6, s7, 1 +; GFX9-NEXT: s_and_b64 vcc, exec, vcc ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v2, s[0:1] ; GFX9-NEXT: s_cbranch_vccz .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 @@ -1034,19 +1040,19 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-LABEL: srem16_invariant_denom: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_mov_b32 s3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sext_i32_i16 s0, s4 -; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX10-NEXT: s_sext_i32_i16 s2, s4 +; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB7_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_sext_i32_i16 s4, s1 -; GFX10-NEXT: v_add_nc_u16 v2, s1, 1 +; GFX10-NEXT: s_sext_i32_i16 s4, s3 +; GFX10-NEXT: v_add_nc_u16 v2, s3, 1 ; GFX10-NEXT: v_cvt_f32_i32_e32 v3, s4 -; GFX10-NEXT: s_xor_b32 s5, s4, s0 +; GFX10-NEXT: s_xor_b32 s5, s4, s2 ; GFX10-NEXT: s_ashr_i32 s5, s5, 30 ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 ; GFX10-NEXT: v_mul_f32_e32 v4, v3, v1 @@ -1059,13 +1065,13 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) 
nocapture %ar ; GFX10-NEXT: s_cselect_b32 s5, s5, 0 ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v3, s5, v3 -; GFX10-NEXT: s_and_b32 s5, 0xffff, s1 -; GFX10-NEXT: v_readfirstlane_b32 s1, v2 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s3 +; GFX10-NEXT: v_readfirstlane_b32 s3, v2 ; GFX10-NEXT: s_lshl_b32 s5, s5, 1 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: v_mul_lo_u32 v3, v3, s0 +; GFX10-NEXT: v_mul_lo_u32 v3, v3, s2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s4, v3 -; GFX10-NEXT: global_store_short v2, v3, s[2:3] +; GFX10-NEXT: global_store_short v2, v3, s[0:1] ; GFX10-NEXT: s_cbranch_vccz .LBB7_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -1073,11 +1079,11 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: srem16_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s2, s2 +; GFX11-NEXT: s_sext_i32_i16 s2, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll index 9da07ea04ded5..011a366267afe 100644 --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -12,8 +12,8 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -40,8 +40,8 
@@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -69,48 +69,49 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: 
global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] @@ -149,8 +150,8 @@ entry: define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_MulMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -178,8 +179,8 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2_MulMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; 
GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -206,45 +207,46 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_MulMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, s0 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, s2 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MulMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; 
GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, s0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, s2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MulMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -288,8 +290,8 @@ entry: define amdgpu_kernel void @idot2(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -316,8 +318,8 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot2: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -345,48 +347,49 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot2: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 
-; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; 
GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] @@ -422,8 +425,8 @@ entry: define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2_MixedTypedMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -450,8 +453,8 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot2_MixedTypedMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -479,45 +482,46 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot2_MixedTypedMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MixedTypedMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_add3_u32 v1, v1, s2, v3 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MixedTypedMul: 
; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -562,8 +566,8 @@ entry: define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_alt_AddOperands: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -590,8 +594,8 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2_alt_AddOperands: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -619,13 +623,13 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_alt_AddOperands: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; 
GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v1 @@ -634,37 +638,38 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v4, v3, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_alt_AddOperands: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_alt_AddOperands: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, 
v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] @@ -700,8 +705,8 @@ entry: define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2_MixedExt: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -728,8 +733,8 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot2_MixedExt: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -757,45 +762,46 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot2_MixedExt: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, v2, 
sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MixedExt: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_add3_u32 v1, v1, s2, v3 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MixedExt: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: 
s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -840,8 +846,8 @@ entry: define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1, ; GFX7-LABEL: notudot2_SameVec: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -866,8 +872,8 @@ define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1, ; ; GFX8-LABEL: notudot2_SameVec: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -893,47 +899,48 @@ define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: notudot2_SameVec: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v2, s0, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: v_add3_u32 v1, v2, s2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_SameVec: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v1, v2, s0, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_add3_u32 v1, v2, s2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_SameVec: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -979,8 +986,8 @@ entry: define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_v4i16: ; GFX7: ; %bb.0: ; %entry -; 
GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1008,8 +1015,8 @@ define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2_v4i16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1037,48 +1044,49 @@ define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_v4i16: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_v4i16: ; 
GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_v4i16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] @@ -1114,8 +1122,8 @@ entry: define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_v4i16_Hi: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: 
v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 @@ -1142,8 +1150,8 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2_v4i16_Hi: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1175,48 +1183,49 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_v4i16_Hi: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] offset:4 -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_v4i16_Hi: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; 
GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] offset:4 -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_v4i16_Hi: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] offset:4 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] @@ -1252,8 +1261,8 @@ entry: define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1, ; GFX7-LABEL: notudot2_v4i16_Even: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1281,8 +1290,8 @@ define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1, ; 
; GFX8-LABEL: notudot2_v4i16_Even: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1310,45 +1319,46 @@ define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: notudot2_v4i16_Even: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] ; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s0, v0 -; GFX9-NODL-NEXT: global_store_dword v4, v0, s[2:3] +; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s2, v0 +; GFX9-NODL-NEXT: global_store_dword v4, v0, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_v4i16_Even: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: 
global_load_dwordx2 v[0:1], v4, s[4:5] ; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v0, v1, s0, v0 -; GFX9-DL-NEXT: global_store_dword v4, v0, s[2:3] +; GFX9-DL-NEXT: v_add3_u32 v0, v1, s2, v0 +; GFX9-DL-NEXT: global_store_dword v4, v0, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_v4i16_Even: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] @@ -1393,8 +1403,8 @@ entry: define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1, ; GFX7-LABEL: notudot2_v4i16_Middle: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1422,8 +1432,8 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1, ; ; GFX8-LABEL: notudot2_v4i16_Middle: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; 
GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1451,45 +1461,46 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: notudot2_v4i16_Middle: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] ; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s0, v0 -; GFX9-NODL-NEXT: global_store_dword v4, v0, s[2:3] +; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s2, v0 +; GFX9-NODL-NEXT: global_store_dword v4, v0, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_v4i16_Middle: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] ; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: 
v_mov_b32_e32 v4, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v0, v1, s0, v0 -; GFX9-DL-NEXT: global_store_dword v4, v0, s[2:3] +; GFX9-DL-NEXT: v_add3_u32 v0, v1, s2, v0 +; GFX9-DL-NEXT: global_store_dword v4, v0, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_v4i16_Middle: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] @@ -1534,8 +1545,8 @@ entry: define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1, ; GFX7-LABEL: notudot2_DiffIndex: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1562,8 +1573,8 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1, ; ; GFX8-LABEL: notudot2_DiffIndex: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1591,45 +1602,46 @@ define amdgpu_kernel void 
@notudot2_DiffIndex(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: notudot2_DiffIndex: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_DiffIndex: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_add3_u32 v1, v1, s2, v3 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_DiffIndex: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1674,8 +1686,8 @@ entry: define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_MultipleUses_add1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1703,8 +1715,8 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2_MultipleUses_add1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1733,49 +1745,50 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_MultipleUses_add1: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 
0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_add1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s2 ; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1 -; 
GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_add1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1825,8 +1838,8 @@ entry: define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2_MultipleUses_add1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1854,8 +1867,8 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot2_MultipleUses_add1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1884,49 +1897,50 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot2_MultipleUses_add1: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; 
GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_add1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s2 ; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; 
GFX10-DL-LABEL: idot2_MultipleUses_add1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1976,8 +1990,8 @@ entry: define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_MultipleUses_mul1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2005,8 +2019,8 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2_MultipleUses_mul1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2035,13 +2049,13 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_MultipleUses_mul1: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; 
GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v1 @@ -2050,20 +2064,20 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v2, v4, v3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v3, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v3, s2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v2 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_mul1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xffff, v1 @@ -2072,16 +2086,17 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, v4, v3 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, v3, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, v3, s2 ; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; 
GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_mul1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -2133,8 +2148,8 @@ entry: define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2_MultipleUses_mul1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2162,8 +2177,8 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot2_MultipleUses_mul1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2192,13 +2207,13 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot2_MultipleUses_mul1: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: 
global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 16 @@ -2207,20 +2222,20 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v2, v4, v3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v4, v3, s0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v4, v3, s2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v2 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_mul1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 16 @@ -2229,16 +2244,17 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v2, v4, v3 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v3, v4, v3, s0 +; GFX9-DL-NEXT: v_mad_i32_i24 v3, v4, v3, s2 ; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; 
GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_mul1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -2290,8 +2306,8 @@ entry: define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_MultipleUses_mul2: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2319,8 +2335,8 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2_MultipleUses_mul2: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2349,13 +2365,13 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_MultipleUses_mul2: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: 
global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 @@ -2363,20 +2379,20 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v4, v2, v1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_mul2: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 @@ -2384,16 +2400,17 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v2, v1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s2 ; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3 -; GFX9-DL-NEXT: 
global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_mul2: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -2445,8 +2462,8 @@ entry: define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2_MultipleUses_mul2: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2474,8 +2491,8 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot2_MultipleUses_mul2: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2504,13 +2521,13 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot2_MultipleUses_mul2: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 @@ -2518,20 +2535,20 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v4, v2, v1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_mul2: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 @@ -2539,16 +2556,17 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, v2, v1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s2 ; 
GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_mul2: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -2600,8 +2618,8 @@ entry: define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2628,8 +2646,8 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2655,14 +2673,14 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_acc16: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NODL-NEXT: global_load_ushort v4, v1, s[2:3] +; GFX9-NODL-NEXT: global_load_ushort v4, v1, s[0:1] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -2670,19 +2688,19 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v0, v5, v4 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v2, v3, v0 -; GFX9-NODL-NEXT: global_store_short v1, v0, s[2:3] +; GFX9-NODL-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ushort v4, v1, s[2:3] +; GFX9-DL-NEXT: global_load_ushort v4, v1, s[0:1] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -2690,21 +2708,21 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v5, v4 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v2, v3, v0 -; GFX9-DL-NEXT: global_store_short v1, v0, s[2:3] +; GFX9-DL-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_acc16: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ushort v4, v1, s[2:3] +; GFX10-DL-NEXT: global_load_ushort v4, v1, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -2712,7 +2730,7 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 -; GFX10-DL-NEXT: global_store_short v1, v0, s[2:3] +; GFX10-DL-NEXT: global_store_short v1, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2741,8 +2759,8 @@ entry: define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1, ; GFX7-LABEL: notsdot2_sext8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2769,8 +2787,8 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: notsdot2_sext8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2800,13 +2818,13 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: notsdot2_sext8: ; GFX9-NODL: ; 
%bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 @@ -2814,35 +2832,36 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v2, 8, v2 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notsdot2_sext8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0001 +; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0001 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_ushort v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_ushort v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 +; 
GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notsdot2_sext8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll index fdd913867c8f8..1d68b0ba0a280 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -10,8 +10,8 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -44,8 +44,8 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -79,13 +79,13 @@ define amdgpu_kernel 
void @idot4_acc32(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 @@ -93,31 +93,32 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s0, v4 +; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s2, v4 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 
0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, s0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, s2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 @@ -133,10 +134,13 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -193,8 +197,8 @@ entry: define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -235,8 +239,8 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) 
%src1, ; ; GFX8-LABEL: idot4_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -276,14 +280,14 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc16: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_bfe_i32 v6, v1, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -305,47 +309,49 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; 
GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_sshort v4, v1, s[2:3] +; GFX9-DL-NEXT: global_load_sshort v4, v1, s[0:1] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_i32_i8 v0, v2, v3, v4 -; GFX9-DL-NEXT: global_store_short v1, v0, s[2:3] +; GFX9-DL-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc16: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_sshort v4, v1, s[2:3] +; GFX10-DL-NEXT: global_load_sshort v4, v1, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4c_i32_i8 v4, v2, v3 -; GFX10-DL-NEXT: global_store_short v1, v4, s[2:3] +; GFX10-DL-NEXT: global_store_short v1, v4, s[0:1] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc16: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] @@ -402,8 +408,8 @@ entry: define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc8: ; GFX7: 
; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -436,8 +442,8 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -469,14 +475,14 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[0:1] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 @@ -490,47 +496,49 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v8, v9, v1 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc8: ; GFX9-DL: ; %bb.0: ; 
%entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[2:3] +; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[0:1] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 -; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-DL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc8: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX10-DL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc8: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: 
v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] @@ -579,8 +587,8 @@ entry: define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_multiuse_mul1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -614,8 +622,8 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_multiuse_mul1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -650,13 +658,13 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_multiuse_mul1: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 @@ -667,37 +675,38 @@ define 
amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v2, v3, v4 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v3, v4, s0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v3, v4, s2 ; GFX9-NODL-NEXT: v_add3_u32 v2, v5, v3, v2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_multiuse_mul1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 8 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_bfe_i32 v4, v2, 0, 8 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v3, v3, v4, s0 +; GFX9-DL-NEXT: v_mad_i32_i24 v3, v3, v4, s2 ; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, v3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_multiuse_mul1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; 
GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -716,9 +725,12 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_multiuse_mul1: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -784,8 +796,8 @@ entry: define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -818,8 +830,8 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -855,13 +867,13 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v3, 8, v1 @@ -872,31 +884,32 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v2, v5, s0, v3 +; GFX9-NODL-NEXT: v_add3_u32 v2, v5, s2, v3 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, s0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, s2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: 
idot4_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 @@ -912,10 +925,13 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_vecMul: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -958,8 +974,8 @@ entry: define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1000,8 +1016,8 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: 
v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1037,15 +1053,15 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc16_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NODL-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -1054,35 +1070,35 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v7, 8, v2 ; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX9-NODL-NEXT: v_perm_b32 v2, v7, v2, s0 -; GFX9-NODL-NEXT: v_perm_b32 v1, v6, v1, s0 +; GFX9-NODL-NEXT: v_perm_b32 v2, v7, v2, s2 +; GFX9-NODL-NEXT: v_perm_b32 v1, v6, v1, s2 ; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v8, 8, v4 ; GFX9-NODL-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v9, 8, v5 ; GFX9-NODL-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NODL-NEXT: v_perm_b32 v5, v9, v5, s0 -; GFX9-NODL-NEXT: v_perm_b32 v4, v8, v4, s0 +; GFX9-NODL-NEXT: v_perm_b32 v5, v9, v5, s2 +; GFX9-NODL-NEXT: v_perm_b32 v4, v8, v4, s2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_add_u16_e32 v3, v1, v3 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 ; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v3, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v2 ; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-DL-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -1091,29 +1107,30 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 8, v2 ; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX9-DL-NEXT: v_perm_b32 v2, v7, v2, s0 -; GFX9-DL-NEXT: v_perm_b32 v1, v6, v1, s0 +; GFX9-DL-NEXT: v_perm_b32 v2, v7, v2, s2 +; GFX9-DL-NEXT: v_perm_b32 v1, v6, v1, s2 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 8, v4 ; GFX9-DL-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 8, v5 ; GFX9-DL-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-DL-NEXT: v_perm_b32 v5, v9, v5, s0 -; GFX9-DL-NEXT: v_perm_b32 v4, v8, v4, s0 +; GFX9-DL-NEXT: v_perm_b32 v5, v9, v5, s2 +; GFX9-DL-NEXT: v_perm_b32 v4, v8, v4, s2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v1, v3 ; 
GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v2 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1150,10 +1167,13 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc16_vecMul: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -1225,8 +1245,8 @@ entry: define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_2ele: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; 
GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1253,8 +1273,8 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_2ele: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1282,47 +1302,48 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_2ele: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s0, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_2ele: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0100 +; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_2ele: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 @@ -1341,10 +1362,13 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_2ele: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_delay_alu 
instid0(VALU_DEP_2) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] @@ -1393,8 +1417,8 @@ entry: define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_3ele: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1424,8 +1448,8 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_3ele: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1456,13 +1480,13 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_3ele: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 @@ -1471,36 +1495,37 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa 
v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, s0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, s2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_3ele: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020100 +; GFX9-DL-NEXT: s_mov_b32 s3, 0xc020100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_3ele: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, 
v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 @@ -1519,10 +1544,13 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_3ele: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] @@ -1578,8 +1606,8 @@ entry: define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_3ele_permuted: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1609,8 +1637,8 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_3ele_permuted: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1641,13 +1669,13 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_3ele_permuted: ; GFX9-NODL: 
; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v3, 24, v1 @@ -1656,36 +1684,37 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, s0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, s2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_3ele_permuted: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020003 +; GFX9-DL-NEXT: s_mov_b32 s3, 0xc020003 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_3ele_permuted: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 @@ -1704,10 +1733,13 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_3ele_permuted: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] @@ -1762,8 +1794,8 @@ entry: define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_opt: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1794,8 +1826,8 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_opt: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1825,8 +1857,8 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_opt: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1841,13 +1873,13 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, v5 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_opt: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] 
@@ -1855,14 +1887,15 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, 0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_opt: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 @@ -1876,10 +1909,13 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_opt: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -1933,7 +1969,7 @@ entry: define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_3src: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b32 s14, 0 ; GFX7-NEXT: s_mov_b32 s15, s11 @@ -1970,7 +2006,7 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX8-LABEL: 
idot4_acc32_3src: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2008,7 +2044,7 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_3src: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] @@ -2031,7 +2067,7 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: idot4_acc32_3src: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] @@ -2053,7 +2089,7 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: idot4_acc32_3src: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 @@ -2076,7 +2112,9 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_3src: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x2 @@ -2143,7 +2181,7 @@ entry: define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) 
%src1, ; GFX7-LABEL: idot4_acc32_3src_3ele: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b32 s14, 0 ; GFX7-NEXT: s_mov_b32 s15, s11 @@ -2177,7 +2215,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_3src_3ele: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2212,7 +2250,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_3src_3ele: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] @@ -2234,7 +2272,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: idot4_acc32_3src_3ele: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] @@ -2258,7 +2296,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: idot4_acc32_3src_3ele: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 @@ -2282,7 +2320,9 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_3src_3ele: ; GFX11-DL: ; %bb.0: 
; %entry -; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x2 @@ -2343,44 +2383,44 @@ entry: define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_bad_source: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dword s12, s[0:1], 0xf -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dword s0, s[2:3], 0xf +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_mov_b32 s14, 0 +; GFX7-NEXT: s_mov_b32 s15, s11 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x11 -; GFX7-NEXT: s_sext_i32_i16 s5, s12 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[12:13], s[6:7] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x11 +; GFX7-NEXT: s_sext_i32_i16 s0, s0 +; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8 ; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_i32 
v5, v0, 8, 8 -; GFX7-NEXT: v_mad_i32_i24 v1, v3, s5, v1 +; GFX7-NEXT: v_mad_i32_i24 v1, v3, s0, v1 ; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8 ; GFX7-NEXT: v_bfe_i32 v0, v0, 16, 8 ; GFX7-NEXT: v_mad_i32_i24 v1, v4, v5, v1 ; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot4_bad_source: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x3c +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s8, s[2:3], 0x3c ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -2390,14 +2430,14 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1, ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX8-NEXT: s_sext_i32_i16 s2, s2 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_sext_i32_i16 s3, s8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v2, v3, 0, 8 ; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8 -; GFX8-NEXT: v_mad_i32_i24 v1, v2, s2, v1 +; GFX8-NEXT: v_mad_i32_i24 v1, v2, s3, v1 ; GFX8-NEXT: v_bfe_i32 v3, v3, 16, 8 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8 @@ -2411,49 +2451,49 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_bad_source: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x3c +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; 
GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x3c ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_sext_i32_i16 s2, s2 +; GFX9-NODL-NEXT: s_sext_i32_i16 s3, s8 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, s2, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, s3, v2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v4, v1 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_bad_source: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x3c +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x3c ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0201 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_mov_b32 s4, 0xc0c0201 -; GFX9-DL-NEXT: s_sext_i32_i16 s2, s2 -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 +; 
GFX9-DL-NEXT: s_sext_i32_i16 s4, s8 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_bfe_i32 v4, v1, 0, 8 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s4 -; GFX9-DL-NEXT: v_mad_i32_i24 v3, v4, s2, v3 -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_mad_i32_i24 v3, v4, s4, v3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 ; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, v3 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm @@ -2461,24 +2501,24 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1, ; GFX10-DL-LABEL: idot4_bad_source: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x3c +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x3c ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_sext_i32_i16 s2, s2 -; GFX10-DL-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-DL-NEXT: s_sext_i32_i16 s3, s8 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 8 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0201 ; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, s2, s3 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, s3, s2 ; 
GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2 ; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm @@ -2486,23 +2526,25 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1, ; GFX11-DL-LABEL: idot4_bad_source: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x3c -; GFX11-DL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b32 s8, s[2:3], 0x3c +; GFX11-DL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] ; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] -; GFX11-DL-NEXT: s_sext_i32_i16 s2, s2 -; GFX11-DL-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX11-DL-NEXT: s_sext_i32_i16 s3, s8 +; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_bfe_i32 v2, v1, 0, 8 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0201 ; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DL-NEXT: v_mad_i32_i24 v2, v2, s2, s3 +; GFX11-DL-NEXT: v_mad_i32_i24 v2, v2, s3, s2 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, v2 neg_lo:[1,1,0] ; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1] @@ -2554,8 +2596,8 @@ entry: define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_commutative: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0xf ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2585,8 +2627,8 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_commutative: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2617,13 +2659,13 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_commutative: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 @@ -2632,36 +2674,37 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, s0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, s2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v0, 
v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_commutative: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020100 +; GFX9-DL-NEXT: s_mov_b32 s3, 0xc020100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_commutative: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 @@ -2680,10 +2723,13 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_commutative: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x3c +; GFX11-DL-NEXT: 
s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x3c +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] @@ -2743,7 +2789,7 @@ entry: define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_3src_3ele_src0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b32 s14, 0 ; GFX7-NEXT: s_mov_b32 s15, s11 @@ -2776,7 +2822,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_3src_3ele_src0: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2810,7 +2856,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_3src_3ele_src0: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3] @@ -2832,7 +2878,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: idot4_acc32_3src_3ele_src0: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ 
-2856,7 +2902,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: idot4_acc32_3src_3ele_src0: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 @@ -2880,7 +2926,9 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_3src_3ele_src0: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x2 @@ -2943,25 +2991,25 @@ entry: define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_4src: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s14, 0 -; GFX7-NEXT: s_mov_b32 s15, s3 +; GFX7-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GFX7-NEXT: s_mov_b32 s15, 0xf000 +; GFX7-NEXT: s_mov_b32 s18, 0 +; GFX7-NEXT: s_mov_b32 s19, s15 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5] +; GFX7-NEXT: s_mov_b64 s[16:17], s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[12:13], s[6:7] -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[12:13], s[8:9] -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[12:15], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[12:13], s[10:11] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x11 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: 
buffer_load_dword v2, v[0:1], s[16:19], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[16:17], s[6:7] +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[16:19], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[16:17], s[8:9] +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[16:17], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64 +; GFX7-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x11 +; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s0, s[12:13], 0x0 ; GFX7-NEXT: s_waitcnt vmcnt(3) ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8 ; GFX7-NEXT: v_bfe_i32 v2, v2, 8, 8 @@ -2969,7 +3017,7 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_i32 v5, v3, 0, 8 ; GFX7-NEXT: v_bfe_i32 v3, v3, 8, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_i32_i24 v1, v1, v2, s4 +; GFX7-NEXT: v_mad_i32_i24 v1, v1, v2, s0 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v2, v4, 0, 8 ; GFX7-NEXT: v_bfe_i32 v4, v4, 8, 8 @@ -2979,14 +3027,14 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_i32 v0, v0, 8, 8 ; GFX7-NEXT: v_mad_i32_i24 v1, v2, v4, v1 ; GFX7-NEXT: v_mad_i32_i24 v0, v3, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot4_4src: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -3029,9 +3077,9 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_4src: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 
+; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] @@ -3055,9 +3103,9 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: idot4_4src: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0501 ; GFX9-DL-NEXT: s_mov_b32 s3, 0x5010c0c ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) @@ -3084,9 +3132,9 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: idot4_4src: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x3 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -3111,9 +3159,11 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_4src: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x3 ; GFX11-DL-NEXT: global_load_b32 v1, v0, 
s[4:5] @@ -3193,8 +3243,8 @@ entry: define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_nonstandard_signed: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -3230,8 +3280,8 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_nonstandard_signed: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3267,10 +3317,10 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_nonstandard_signed: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff +; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] @@ -3283,7 +3333,7 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX9-NODL-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX9-NODL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v7, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX9-NODL-NEXT: v_bfe_i32 v3, v3, 0, 8 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v4, v6, v5, v4 @@ -3292,15 +3342,15 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_nonstandard_signed: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_movk_i32 s0, 0xff +; GFX9-DL-NEXT: s_movk_i32 s2, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] @@ -3313,7 +3363,7 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX9-DL-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX9-DL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v7, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX9-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 ; GFX9-DL-NEXT: v_mad_legacy_u16 v4, v6, v5, v4 @@ -3322,15 +3372,16 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; 
GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_nonstandard_signed: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v6, 0xff -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -3360,9 +3411,12 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_nonstandard_signed: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index 0b131ea74f1ab..fb94b504781b1 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -10,8 +10,8 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 
s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -44,8 +44,8 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -79,13 +79,13 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc32: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 @@ -93,37 +93,38 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s0, v4 +; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s2, v4 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32: ; GFX9-DL: ; %bb.0: ; 
%entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] @@ -131,10 +132,13 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_acc32: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -192,8 +196,8 @@ entry: define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -226,8 +230,8 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -264,15 +268,15 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc16: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff +; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX9-NODL-NEXT: s_waitcnt 
vmcnt(2) ; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -283,54 +287,56 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 -; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v8, v9, v3 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ushort v4, v1, s[2:3] +; GFX9-DL-NEXT: global_load_ushort v4, v1, s[0:1] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 -; GFX9-DL-NEXT: global_store_short v1, v0, s[2:3] +; GFX9-DL-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc16: ; GFX10-DL: ; %bb.0: ; %entry ; 
GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ushort v4, v1, s[2:3] +; GFX10-DL-NEXT: global_load_ushort v4, v1, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 -; GFX10-DL-NEXT: global_store_short v1, v0, s[2:3] +; GFX10-DL-NEXT: global_store_short v1, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_acc16: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] @@ -388,8 +394,8 @@ entry: define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -422,8 +428,8 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: 
udot4_acc8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -455,14 +461,14 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[0:1] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 @@ -476,47 +482,49 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v8, v9, v1 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: 
global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[2:3] +; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[0:1] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 -; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-DL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc8: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX10-DL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_acc8: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] @@ -566,8 +574,8 @@ entry: define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_8: ; GFX7: ; %bb.0: ; %entry -; 
GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -594,8 +602,8 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2_8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -621,14 +629,14 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NODL-NEXT: global_load_ubyte v4, v1, s[2:3] +; GFX9-NODL-NEXT: global_load_ubyte v4, v1, s[0:1] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -636,57 +644,59 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v2, v2, v3, v4 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v0, v5, v2 -; GFX9-NODL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-NODL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; 
GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s0, 0xc0c0100 +; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s0 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_8: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0100 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 -; GFX10-DL-NEXT: global_store_byte v0, v1, s[2:3] +; GFX10-DL-NEXT: global_store_byte 
v0, v1, s[0:1] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot2_8: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -730,8 +740,8 @@ entry: define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_CommutationInsideMAD: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -764,8 +774,8 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_CommutationInsideMAD: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -797,14 +807,14 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_CommutationInsideMAD: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[0:1] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 @@ -818,47 +828,49 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v5, v4, v1 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v9, v8, v1 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_CommutationInsideMAD: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[2:3] +; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[0:1] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 -; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-DL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_CommutationInsideMAD: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; 
GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX10-DL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_CommutationInsideMAD: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] @@ -908,8 +920,8 @@ entry: define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_CommutationAccrossMADs: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -942,8 +954,8 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_CommutationAccrossMADs: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -975,14 +987,14 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_CommutationAccrossMADs: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[0:1] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -996,47 +1008,49 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v5, v4, v1 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v9, v8, v1 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[2:3] +; GFX9-DL-NEXT: 
global_load_ubyte v4, v1, s[0:1] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 -; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-DL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX10-DL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] @@ -1086,8 +1100,8 @@ entry: define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_multiuse_mul1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1121,8 +1135,8 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_multiuse_mul1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1157,13 +1171,13 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_multiuse_mul1: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 @@ -1174,37 +1188,38 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v2, v3, v4 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v3, v4, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v3, v4, s2 ; GFX9-NODL-NEXT: v_add3_u32 v2, v5, v3, v2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; 
GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_multiuse_mul1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xff, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v3, v3, v4, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v3, v3, v4, s2 ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_multiuse_mul1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1223,9 +1238,12 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_multiuse_mul1: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 
0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -1291,8 +1309,8 @@ entry: define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_multiuse_add1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1327,8 +1345,8 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_multiuse_add1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1364,13 +1382,13 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_multiuse_add1: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: 
v_bfe_u32 v4, v1, 8, 8 @@ -1380,37 +1398,38 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v4, v5, s0 -; GFX9-NODL-NEXT: v_add_u32_e32 v4, s0, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v4, v5, s2 +; GFX9-NODL-NEXT: v_add_u32_e32 v4, s2, v2 ; GFX9-NODL-NEXT: v_add3_u32 v2, v2, v3, v6 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v1, v4 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_multiuse_add1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_add_i32 s1, s0, s0 +; GFX9-DL-NEXT: s_add_i32 s3, s2, s2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0 -; GFX9-DL-NEXT: v_add3_u32 v1, s1, v3, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2 +; GFX9-DL-NEXT: v_add3_u32 v1, s3, v3, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_multiuse_add1: ; 
GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1428,9 +1447,12 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_multiuse_add1: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -1498,8 +1520,8 @@ entry: define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; GFX7-LABEL: notdot4_mixedtypes: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1534,8 +1556,8 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; ; GFX8-LABEL: notdot4_mixedtypes: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_waitcnt 
lgkmcnt(0) @@ -1572,15 +1594,15 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: notdot4_mixedtypes: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff +; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -1591,27 +1613,27 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_bfe_i32 v5, v2, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3 -; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v8, v9, v3 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: 
notdot4_mixedtypes: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s0, 0xc0c0302 +; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0302 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -1622,25 +1644,25 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_bfe_i32 v5, v2, 0, 8 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3 -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s0 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s2 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s2 ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 -; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notdot4_mixedtypes: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3] 
+; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -1656,14 +1678,16 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mad_u16 v3, v6, v7, v3 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 -; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3] +; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: notdot4_mixedtypes: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 @@ -1739,8 +1763,8 @@ entry: define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX7-LABEL: notdot4_mixedtypes2: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1777,8 +1801,8 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; ; GFX8-LABEL: notdot4_mixedtypes2: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1817,15 +1841,15 @@ define 
amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: notdot4_mixedtypes2: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff +; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -1837,7 +1861,7 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_and_b32_e32 v6, 0xff, v2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v7, v8, v3 -; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX9-NODL-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v5, v6, v3 @@ -1845,20 +1869,20 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v4, v9, v3 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notdot4_mixedtypes2: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_movk_i32 s0, 0xff +; GFX9-DL-NEXT: s_movk_i32 s2, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -1870,7 +1894,7 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_and_b32_e32 v6, 0xff, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v8, v3 -; GFX9-DL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v9, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX9-DL-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v5, v6, v3 @@ -1878,14 +1902,14 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v9, v3 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notdot4_mixedtypes2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v8, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 
@@ -1893,7 +1917,7 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -1913,14 +1937,16 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3 ; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 -; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3] +; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: notdot4_mixedtypes2: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 @@ -2001,8 +2027,8 @@ entry: define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc32_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2035,8 +2061,8 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc32_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2070,13 +2096,13 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc32_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 @@ -2084,37 +2110,38 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s0, v4 +; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s2, v4 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] @@ -2122,10 +2149,13 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_acc32_vecMul: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; 
GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -2169,8 +2199,8 @@ entry: define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2207,8 +2237,8 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2243,16 +2273,16 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc16_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff -; GFX9-NODL-NEXT: s_mov_b32 s1, 0x5040100 +; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_mov_b32 s3, 0x5040100 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v4, 8, v1 ; 
GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 @@ -2260,13 +2290,13 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v6, 8, v2 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX9-NODL-NEXT: v_and_b32_e32 v8, 0xff, v1 -; GFX9-NODL-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v1, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NODL-NEXT: v_and_b32_e32 v9, 0xff, v2 -; GFX9-NODL-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NODL-NEXT: v_perm_b32 v2, v7, v2, s1 -; GFX9-NODL-NEXT: v_perm_b32 v1, v5, v1, s1 -; GFX9-NODL-NEXT: v_perm_b32 v5, v6, v9, s1 -; GFX9-NODL-NEXT: v_perm_b32 v4, v4, v8, s1 +; GFX9-NODL-NEXT: v_and_b32_sdwa v2, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_perm_b32 v2, v7, v2, s3 +; GFX9-NODL-NEXT: v_perm_b32 v1, v5, v1, s3 +; GFX9-NODL-NEXT: v_perm_b32 v5, v6, v9, s3 +; GFX9-NODL-NEXT: v_perm_b32 v4, v4, v8, s3 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) @@ -2274,21 +2304,21 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_movk_i32 s0, 0xff -; GFX9-DL-NEXT: s_mov_b32 s1, 0x5040100 +; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_mov_b32 s3, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b16_e32 v4, 8, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 @@ -2296,13 +2326,13 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshrrev_b16_e32 v6, 8, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v8, 0xff, v1 -; GFX9-DL-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v1, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-DL-NEXT: v_and_b32_e32 v9, 0xff, v2 -; GFX9-DL-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-DL-NEXT: v_perm_b32 v2, v7, v2, s1 -; GFX9-DL-NEXT: v_perm_b32 v1, v5, v1, s1 -; GFX9-DL-NEXT: v_perm_b32 v5, v6, v9, s1 -; GFX9-DL-NEXT: v_perm_b32 v4, v4, v8, s1 +; GFX9-DL-NEXT: v_and_b32_sdwa v2, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_perm_b32 v2, v7, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v5, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v5, v6, v9, s3 +; GFX9-DL-NEXT: v_perm_b32 v4, v4, v8, s3 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) @@ -2310,14 +2340,15 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v8, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 @@ -2353,9 +2384,12 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_acc16_vecMul: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -2427,8 +2461,8 @@ entry: define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc8_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2461,8 +2495,8 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc8_vecMul: ; GFX8: ; 
%bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2498,14 +2532,14 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc8_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[0:1] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -2521,19 +2555,19 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v6 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 ; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v8 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: 
v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -2549,14 +2583,15 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v8 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -2590,10 +2625,13 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_acc8_vecMul: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -2662,8 +2700,8 @@ entry: define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_2ele: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2690,8 +2728,8 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_2ele: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2719,47 +2757,48 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_2ele: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s0, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_2ele: ; GFX9-DL: ; %bb.0: ; 
%entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0100 +; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_2ele: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] @@ -2777,10 +2816,13 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_2ele: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] @@ -2828,8 +2870,8 @@ entry: define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_3ele: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2859,8 +2901,8 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_3ele: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2891,13 +2933,13 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_3ele: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: 
v_and_b32_e32 v3, 0xff, v1 @@ -2906,36 +2948,37 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, s2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_3ele: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020100 +; GFX9-DL-NEXT: s_mov_b32 s3, 0xc020100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_3ele: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] @@ -2953,10 +2996,13 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_3ele: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] @@ -3011,8 +3057,8 @@ entry: define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_3ele_permuted: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -3042,8 +3088,8 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_3ele_permuted: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -3074,13 
+3120,13 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_3ele_permuted: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v3, 24, v1 @@ -3089,36 +3135,37 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, s2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_3ele_permuted: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020003 +; GFX9-DL-NEXT: s_mov_b32 s3, 0xc020003 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; 
GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_3ele_permuted: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] @@ -3136,10 +3183,13 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_3ele_permuted: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] @@ -3195,8 +3245,8 @@ entry: define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_opt: ; GFX7: ; 
%bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -3227,8 +3277,8 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_opt: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -3258,8 +3308,8 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_opt: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] @@ -3274,13 +3324,13 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_opt: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; 
GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -3288,14 +3338,15 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_opt: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -3308,10 +3359,13 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_opt: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -3365,7 +3419,7 @@ entry: define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc32_3src: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b32 s14, 0 ; 
GFX7-NEXT: s_mov_b32 s15, s11 @@ -3402,7 +3456,7 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc32_3src: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -3440,7 +3494,7 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc32_3src: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] @@ -3463,7 +3517,7 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: udot4_acc32_3src: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] @@ -3485,7 +3539,7 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: udot4_acc32_3src: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 @@ -3507,7 +3561,9 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_acc32_3src: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x2 @@ -3575,7 +3631,7 @@ entry: define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc32_3src_3ele: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b32 s14, 0 ; GFX7-NEXT: s_mov_b32 s15, s11 @@ -3609,7 +3665,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc32_3src_3ele: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -3644,7 +3700,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc32_3src_3ele: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] @@ -3666,7 +3722,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: udot4_acc32_3src_3ele: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] @@ -3690,7 +3746,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: udot4_acc32_3src_3ele: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 @@ -3713,7 +3769,9 @@ 
define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_acc32_3src_3ele: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x2 @@ -3776,44 +3834,44 @@ entry: define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_bad_source: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dword s12, s[0:1], 0xf -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dword s0, s[2:3], 0xf +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_mov_b32 s14, 0 +; GFX7-NEXT: s_mov_b32 s15, s11 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x11 -; GFX7-NEXT: s_and_b32 s5, s12, 0xffff -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[12:13], s[6:7] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x11 +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff +; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: 
s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 -; GFX7-NEXT: v_mad_u32_u24 v1, v3, s5, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v3, s0, v1 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v5, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot4_bad_source: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x3c +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s8, s[2:3], 0x3c ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -3823,14 +3881,14 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1, ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX8-NEXT: s_and_b32 s2, s2, 0xffff +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_and_b32 s3, s8, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 -; GFX8-NEXT: v_mad_u32_u24 v1, v2, s2, v1 +; GFX8-NEXT: v_mad_u32_u24 v1, v2, s3, v1 ; GFX8-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8 @@ -3844,49 +3902,49 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_bad_source: ; GFX9-NODL: ; %bb.0: ; %entry -; 
GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x3c +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x3c ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NODL-NEXT: s_and_b32 s3, s8, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, s2, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, s3, v2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v4, v1 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_bad_source: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x3c +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x3c ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0201 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, 
s[6:7] -; GFX9-DL-NEXT: s_mov_b32 s4, 0xc0c0201 -; GFX9-DL-NEXT: s_and_b32 s2, s2, 0xffff -; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-DL-NEXT: s_and_b32 s4, s8, 0xffff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xff, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s4 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, s2, v3 -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, s4, v3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm @@ -3894,24 +3952,24 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1, ; GFX10-DL-LABEL: udot4_bad_source: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x3c +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x3c ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_and_b32 s2, s2, 0xffff -; GFX10-DL-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-DL-NEXT: s_and_b32 s3, s8, 0xffff +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xff, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0201 ; GFX10-DL-NEXT: v_perm_b32 v1, 
v1, v1, 0xc0c0201 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, s2, s3 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, s3, s2 ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v2, v0 ; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm @@ -3919,23 +3977,25 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1, ; GFX11-DL-LABEL: udot4_bad_source: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x3c -; GFX11-DL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b32 s8, s[2:3], 0x3c +; GFX11-DL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] ; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] -; GFX11-DL-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-DL-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX11-DL-NEXT: s_and_b32 s3, s8, 0xffff +; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0201 ; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DL-NEXT: v_mad_u32_u24 v2, v2, s2, s3 +; GFX11-DL-NEXT: v_mad_u32_u24 v2, v2, s3, s2 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v2 ; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1] @@ -3987,8 +4047,8 @@ entry: define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_commutative: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -4018,8 +4078,8 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_commutative: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -4050,13 +4110,13 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_commutative: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 @@ -4065,36 +4125,37 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, s2 ; 
GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_commutative: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020100 +; GFX9-DL-NEXT: s_mov_b32 s3, 0xc020100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_commutative: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] @@ -4112,10 +4173,13 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_commutative: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: 
s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x3c +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x3c +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] @@ -4175,7 +4239,7 @@ entry: define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc32_3src_3ele_src0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b32 s14, 0 ; GFX7-NEXT: s_mov_b32 s15, s11 @@ -4208,7 +4272,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc32_3src_3ele_src0: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -4242,7 +4306,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc32_3src_3ele_src0: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3] @@ -4264,7 +4328,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: udot4_acc32_3src_3ele_src0: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], 
s[2:3], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -4288,7 +4352,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: udot4_acc32_3src_3ele_src0: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 @@ -4311,7 +4375,9 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_acc32_3src_3ele_src0: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x2 @@ -4374,25 +4440,25 @@ entry: define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_4src: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s14, 0 -; GFX7-NEXT: s_mov_b32 s15, s3 +; GFX7-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GFX7-NEXT: s_mov_b32 s15, 0xf000 +; GFX7-NEXT: s_mov_b32 s18, 0 +; GFX7-NEXT: s_mov_b32 s19, s15 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5] +; GFX7-NEXT: s_mov_b64 s[16:17], s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[12:13], s[6:7] -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[12:13], s[8:9] -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[12:15], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[12:13], s[10:11] -; GFX7-NEXT: 
buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x11 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[16:19], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[16:17], s[6:7] +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[16:19], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[16:17], s[8:9] +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[16:17], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64 +; GFX7-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x11 +; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s0, s[12:13], 0x0 ; GFX7-NEXT: s_waitcnt vmcnt(3) ; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v2, v2, 8, 8 @@ -4400,7 +4466,7 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 8, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v2, s4 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v2, s0 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v4 ; GFX7-NEXT: v_bfe_u32 v4, v4, 8, 8 @@ -4410,14 +4476,14 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v2, v4, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot4_4src: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -4460,9 +4526,9 @@ define amdgpu_kernel void 
@udot4_4src(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_4src: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] @@ -4486,9 +4552,9 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: udot4_4src: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0501 ; GFX9-DL-NEXT: s_mov_b32 s3, 0x5010c0c ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) @@ -4515,9 +4581,9 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: udot4_4src: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x3 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -4541,9 +4607,11 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_4src: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; 
GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x3 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -4623,8 +4691,8 @@ entry: define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc32_multi: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -4666,8 +4734,8 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc32_multi: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -4709,13 +4777,13 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc32_multi: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v3, v2, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 @@ -4727,44 +4795,45 @@ define amdgpu_kernel void 
@udot4_acc32_multi(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v9, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v3, v4, s0, v6 +; GFX9-NODL-NEXT: v_add3_u32 v3, v4, s2, v6 ; GFX9-NODL-NEXT: v_add3_u32 v3, v3, v7, v9 ; GFX9-NODL-NEXT: v_add3_u32 v0, v5, v3, v0 ; GFX9-NODL-NEXT: v_add3_u32 v0, v0, v8, v1 -; GFX9-NODL-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32_multi: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX9-DL-NEXT: s_mov_b32 s0, 0x6040200 -; GFX9-DL-NEXT: s_mov_b32 s1, 0x2000200 +; GFX9-DL-NEXT: s_mov_b32 s2, 0x6040200 +; GFX9-DL-NEXT: s_mov_b32 s3, 0x2000200 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v2, s[6:7] -; GFX9-DL-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 ; GFX9-DL-NEXT: s_mov_b32 s4, 0x7050301 ; GFX9-DL-NEXT: s_mov_b32 s6, 0x3010301 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v4, v1, v0, s0 +; GFX9-DL-NEXT: v_perm_b32 v4, v1, v0, s2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v5, v3, v3, s1 +; GFX9-DL-NEXT: v_perm_b32 v5, v3, v3, s3 ; GFX9-DL-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v4, v5, s5 ; GFX9-DL-NEXT: v_perm_b32 v3, v3, v3, s6 ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v0, v3, v1 -; GFX9-DL-NEXT: global_store_dword v2, 
v0, s[2:3] +; GFX9-DL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32_multi: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] @@ -4785,9 +4854,12 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_acc32_multi: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b64 v[0:1], v2, s[4:5] @@ -4882,8 +4954,8 @@ entry: define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_hilo: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -4914,8 +4986,8 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_hilo: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -4947,8 +5019,8 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_hilo: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 @@ -4963,13 +5035,13 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_hilo: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 @@ -4977,14 +5049,15 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_hilo: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: 
s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 @@ -4997,10 +5070,13 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_hilo: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] offset:4 @@ -5055,8 +5131,8 @@ entry: define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_lohi: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -5087,8 +5163,8 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_lohi: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ 
-5120,8 +5196,8 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_lohi: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] @@ -5136,33 +5212,34 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_lohi: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-DL-NEXT: s_mov_b32 s0, 0x10302 -; GFX9-DL-NEXT: s_mov_b32 s1, 0x3020001 +; GFX9-DL-NEXT: s_mov_b32 s2, 0x10302 +; GFX9-DL-NEXT: s_mov_b32 s3, 0x3020001 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, 0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; 
GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_lohi: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4 @@ -5178,10 +5255,13 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_lohi: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] offset:4 @@ -5240,8 +5320,8 @@ entry: define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_hihi: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 @@ -5272,8 +5352,8 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_hihi: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -5307,8 +5387,8 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_hihi: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 @@ -5323,33 +5403,34 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_hihi: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-DL-NEXT: s_mov_b32 s0, 0x1030200 -; GFX9-DL-NEXT: s_mov_b32 s1, 0x3010002 +; GFX9-DL-NEXT: s_mov_b32 s2, 0x1030200 +; GFX9-DL-NEXT: s_mov_b32 s3, 0x3010002 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] offset:4 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, 
s1 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, 0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_hihi: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4 @@ -5365,10 +5446,13 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_hihi: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] offset:4 @@ -5427,16 +5511,16 @@ entry: define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_v8i8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, s3 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, s7 ; 
GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_u32 v4, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v5, v1, 8, 8 @@ -5450,17 +5534,17 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1, ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX7-NEXT: v_mad_u32_u24 v2, v6, v7, v2 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v1, v2 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot4_acc32_v8i8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -5474,19 +5558,19 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1, ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX8-NEXT: v_mad_u32_u24 v2, v5, v6, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, v0, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_acc32_v8i8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-NODL-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v0 ; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v1 @@ -5500,12 +5584,12 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: idot4_acc32_v8i8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0 ; GFX9-DL-NEXT: global_store_dword v2, v0, s[4:5] @@ -5513,28 +5597,30 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: idot4_acc32_v8i8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-DL-NEXT: 
s_endpgm ; ; GFX11-DL-LABEL: idot4_acc32_v8i8: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[2:3], s[2:3], 0x34 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DL-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-DL-NEXT: global_load_b64 v[0:1], v0, s[0:1] ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0 -; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3] ; GFX11-DL-NEXT: s_nop 0 ; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm @@ -5581,8 +5667,8 @@ entry: define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_v16i8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -5616,8 +5702,8 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_v16i8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -5648,8 +5734,8 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_v16i8: ; GFX9-NODL: ; 
%bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX9-NODL-NEXT: ; kill: killed $vgpr5 @@ -5668,41 +5754,42 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX9-NODL-NEXT: v_add3_u32 v0, v2, v6, v0 -; GFX9-NODL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_v16i8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v5, 3, v0 -; GFX9-DL-NEXT: s_mov_b32 s0, 0x7050002 +; GFX9-DL-NEXT: s_mov_b32 s2, 0x7050002 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] ; GFX9-DL-NEXT: global_load_dword v0, v5, s[6:7] -; GFX9-DL-NEXT: s_mov_b32 s1, 0x3020001 +; GFX9-DL-NEXT: s_mov_b32 s3, 0x3020001 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7 ; GFX9-DL-NEXT: ; kill: killed $vgpr5 ; GFX9-DL-NEXT: ; kill: killed $vgpr4 -; GFX9-DL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7 -; GFX9-DL-NEXT: v_perm_b32 v2, v3, v2, s0 +; GFX9-DL-NEXT: v_perm_b32 v2, v3, v2, s2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v0, v0, v0, s1 +; GFX9-DL-NEXT: v_perm_b32 v0, v0, v0, s3 ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, 
0 -; GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-DL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_v16i8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v5, 3, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7 ; GFX10-DL-NEXT: ; kill: killed $vgpr5 ; GFX10-DL-NEXT: ; kill: killed $vgpr4 -; GFX10-DL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] ; GFX10-DL-NEXT: global_load_dword v0, v5, s[6:7] @@ -5717,10 +5804,13 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_v16i8: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v1, 4, v0 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: global_load_b128 v[0:3], v1, s[4:5] ; GFX11-DL-NEXT: global_load_b32 v0, v4, s[6:7] @@ -5779,8 +5869,8 @@ entry: define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_v256i8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; 
GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -5813,8 +5903,8 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_v256i8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_movk_i32 s2, 0xfc @@ -5848,8 +5938,8 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_v256i8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v1, 8, v0 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) @@ -5865,35 +5955,36 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v1, v4, v5 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v6, v2 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_v256i8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v1, 8, v0 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-DL-NEXT: s_mov_b32 s0, 0x3020001 +; GFX9-DL-NEXT: s_mov_b32 s2, 0x3020001 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword 
v2, v0, s[6:7] ; GFX9-DL-NEXT: global_load_dword v3, v1, s[4:5] offset:252 -; GFX9-DL-NEXT: s_mov_b32 s1, 0x1000302 +; GFX9-DL-NEXT: s_mov_b32 s3, 0x1000302 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v2, v2, s0 +; GFX9-DL-NEXT: v_perm_b32 v1, v2, v2, s2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s1 +; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s3 ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, 0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_v256i8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_dword v2, v1, s[6:7] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5] offset:252 @@ -5908,10 +5999,13 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_v256i8: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 3, v0 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: global_load_b32 v1, v1, s[6:7] ; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5] offset:252 @@ -5969,8 +6063,8 @@ entry: define amdgpu_kernel void @idot4_acc32_anyext(ptr 
addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_anyext: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -5996,8 +6090,8 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_anyext: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -6024,48 +6118,49 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_anyext: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s0, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; 
GFX9-NODL-NEXT: v_add3_u32 v1, v3, s2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_anyext: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0500 +; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0500 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: s_mov_b32 s4, 0xc0c0100 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v1, s1 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v1, s3 ; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s4 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_anyext: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -6082,10 +6177,13 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_anyext: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; 
GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll index 8c53d2671de3f..99bb4d50b03d4 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -12,13 +12,13 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc32: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -63,11 +63,11 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_acc32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -78,10 +78,10 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s11, 0xe80000 -; GFX8-NEXT: s_add_u32 s8, s8, s3 -; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: s_mov_b32 s14, -1 +; GFX8-NEXT: s_mov_b32 s15, 0xe80000 +; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4 ; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4 @@ -116,20 +116,20 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; ; GFX9-LABEL: idot8_acc32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_add_u32 s12, s12, s9 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_bfe_i32 v3, v1, 0, 4 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -154,54 +154,55 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_i32_i24_e32 v6, 
v9, v10 ; GFX9-NEXT: v_mul_i32_i24_e32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add3_u32 v2, v3, s0, v4 +; GFX9-NEXT: v_add3_u32 v2, v3, s2, v4 ; GFX9-NEXT: v_mul_i32_i24_e32 v7, v11, v12 ; GFX9-NEXT: v_mul_i32_i24_e32 v8, v13, v14 ; GFX9-NEXT: v_add3_u32 v2, v2, v5, v6 ; GFX9-NEXT: v_mul_i32_i24_e32 v9, v15, v16 ; GFX9-NEXT: v_add3_u32 v2, v2, v7, v8 ; GFX9-NEXT: v_add3_u32 v1, v2, v9, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc32: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s10, -1 -; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_acc32: ; GFX10-DL-XNACK: ; %bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-XNACK-NEXT: s_clause 0x1 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s2 ; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[0:1] @@ -209,16 +210,17 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; ; GFX10-DL-NOXNACK-LABEL: idot8_acc32: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 
s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -305,13 +307,13 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -372,11 +374,11 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 12 -; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; 
GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -389,11 +391,11 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s11, 0xe80000 -; GFX8-NEXT: s_add_u32 s8, s8, s3 -; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s14, -1 +; GFX8-NEXT: s_mov_b32 s15, 0xe80000 +; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3 @@ -452,21 +454,21 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; ; GFX9-LABEL: idot8_acc16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 12 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -520,26 
+522,26 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 ; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s10, -1 -; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -593,20 +595,21 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_acc16: ; GFX10-DL-XNACK: ; 
%bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-XNACK-NEXT: s_clause 0x1 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -676,16 +679,16 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; GFX10-DL-NOXNACK-LABEL: idot8_acc16: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 
+; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -826,13 +829,13 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc8: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -893,11 +896,11 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_acc8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 12 -; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -910,11 +913,11 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX8-NEXT: s_mov_b32 s10, -1 -; 
GFX8-NEXT: s_mov_b32 s11, 0xe80000 -; GFX8-NEXT: s_add_u32 s8, s8, s3 -; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s14, -1 +; GFX8-NEXT: s_mov_b32 s15, 0xe80000 +; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3 @@ -973,21 +976,21 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; ; GFX9-LABEL: idot8_acc8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 12 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -1041,26 +1044,26 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 ; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NEXT: 
s_endpgm ; ; GFX9-DL-LABEL: idot8_acc8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s10, -1 -; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -1114,20 +1117,21 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_acc8: ; GFX10-DL-XNACK: ; %bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-XNACK-NEXT: s_clause 0x1 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-XNACK-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x34 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -1197,16 +1201,16 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; GFX10-DL-NOXNACK-LABEL: idot8_acc8: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -1348,13 +1352,13 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_multiuses_mul1: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1401,11 +1405,11 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_multiuses_mul1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1416,10 +1420,10 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s11, 0xe80000 -; GFX8-NEXT: s_add_u32 s8, s8, s3 -; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: s_mov_b32 s14, -1 +; GFX8-NEXT: s_mov_b32 s15, 0xe80000 +; GFX8-NEXT: s_add_u32 s12, s12, s9 +; 
GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4 ; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4 @@ -1456,20 +1460,20 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; ; GFX9-LABEL: idot8_multiuses_mul1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_add_u32 s12, s12, s9 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_bfe_i32 v3, v1, 0, 4 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1490,7 +1494,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 28, v2 ; GFX9-NEXT: v_mul_i32_i24_e32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_i32_i24 v2, v3, v4, s0 +; GFX9-NEXT: v_mad_i32_i24 v2, v3, v4, s2 ; GFX9-NEXT: v_mul_i32_i24_e32 v5, v5, v6 ; GFX9-NEXT: v_mul_i32_i24_e32 v6, v7, v8 ; GFX9-NEXT: v_mad_i32_i24 v3, v3, v4, v2 @@ -1502,25 +1506,25 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-NEXT: v_add3_u32 v3, v3, v7, v8 ; GFX9-NEXT: v_add3_u32 v3, v3, v9, v10 ; GFX9-NEXT: v_add3_u32 v1, v3, v1, v2 
-; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_multiuses_mul1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s10, -1 -; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 4 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) @@ -1541,7 +1545,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 28, v2 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v1, v1, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v4, s0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v4, s2 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v5, v5, v6 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v6, v7, v8 ; GFX9-DL-NEXT: v_mad_i32_i24 v3, v3, v4, v2 @@ -1553,20 +1557,21 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add3_u32 v3, v3, v7, v8 ; GFX9-DL-NEXT: v_add3_u32 v3, v3, v9, v10 ; GFX9-DL-NEXT: v_add3_u32 v1, v3, v1, 
v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_multiuses_mul1: ; GFX10-DL-XNACK: ; %bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-XNACK-NEXT: s_clause 0x1 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -1610,15 +1615,16 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; ; GFX10-DL-NOXNACK-LABEL: idot8_multiuses_mul1: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-NOXNACK-NEXT: 
s_mov_b32 s11, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -1737,13 +1743,13 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc32_vecMul: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1788,11 +1794,11 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_acc32_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ 
-1803,10 +1809,10 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s11, 0xe80000 -; GFX8-NEXT: s_add_u32 s8, s8, s3 -; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: s_mov_b32 s14, -1 +; GFX8-NEXT: s_mov_b32 s15, 0xe80000 +; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 28, v3 ; GFX8-NEXT: v_bfe_i32 v2, v3, 24, 4 @@ -1841,20 +1847,20 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX9-LABEL: idot8_acc32_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_add_u32 s12, s12, s9 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 28, v1 ; GFX9-NEXT: v_bfe_i32 v4, v1, 24, 4 @@ -1878,7 +1884,7 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_i32_i24_e32 v8, v8, v15 ; GFX9-NEXT: 
v_mul_i32_i24_e32 v7, v7, v14 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add3_u32 v1, v1, s0, v2 +; GFX9-NEXT: v_add3_u32 v1, v1, s2, v2 ; GFX9-NEXT: v_mul_i32_i24_e32 v6, v6, v13 ; GFX9-NEXT: v_mul_i32_i24_e32 v5, v5, v12 ; GFX9-NEXT: v_add3_u32 v1, v1, v8, v7 @@ -1886,47 +1892,48 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_i32_i24_e32 v3, v3, v10 ; GFX9-NEXT: v_add3_u32 v1, v1, v6, v5 ; GFX9-NEXT: v_add3_u32 v1, v1, v4, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s10, -1 -; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: 
idot8_acc32_vecMul: ; GFX10-DL-XNACK: ; %bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-XNACK-NEXT: s_clause 0x1 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s2 ; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[0:1] @@ -1934,16 +1941,17 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX10-DL-NOXNACK-LABEL: idot8_acc32_vecMul: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -1994,13 +2002,13 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2061,11 +2069,11 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 12 -; GFX8-NEXT: s_mov_b32 s8, 
SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -2078,11 +2086,11 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s11, 0xe80000 -; GFX8-NEXT: s_add_u32 s8, s8, s3 -; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s14, -1 +; GFX8-NEXT: s_mov_b32 s15, 0xe80000 +; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3 ; GFX8-NEXT: v_lshlrev_b16_sdwa v7, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 @@ -2141,22 +2149,22 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX9-LABEL: idot8_acc16_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 12 +; GFX9-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; 
GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v1 @@ -2191,9 +2199,9 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 12, v17 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 12, v2 -; GFX9-NEXT: v_perm_b32 v7, v8, v7, s0 -; GFX9-NEXT: v_perm_b32 v8, v13, v12, s0 -; GFX9-NEXT: v_perm_b32 v5, v6, v5, s0 +; GFX9-NEXT: v_perm_b32 v7, v8, v7, s2 +; GFX9-NEXT: v_perm_b32 v8, v13, v12, s2 +; GFX9-NEXT: v_perm_b32 v5, v6, v5, s2 ; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9 ; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 ; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v16 @@ -2205,11 +2213,11 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v17 ; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v5, v5, v8 -; GFX9-NEXT: v_perm_b32 v2, v2, v4, s0 -; GFX9-NEXT: v_perm_b32 v1, v1, v11, s0 -; GFX9-NEXT: v_perm_b32 v4, v17, v16, s0 -; GFX9-NEXT: v_perm_b32 v9, v10, v9, s0 -; GFX9-NEXT: v_perm_b32 v10, v15, v14, s0 +; GFX9-NEXT: v_perm_b32 v2, v2, v4, s2 +; GFX9-NEXT: v_perm_b32 v1, v1, v11, s2 +; GFX9-NEXT: v_perm_b32 v4, v17, v16, s2 +; GFX9-NEXT: v_perm_b32 v9, v10, v9, s2 +; GFX9-NEXT: v_perm_b32 v10, v15, v14, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v3, v5, v3 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 @@ -2222,27 +2230,27 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; 
GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s10, -1 -; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 +; GFX9-DL-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] -; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 4, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v1 @@ -2277,9 +2285,9 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v15 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v17, 12, v17 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v2, 12, v2 -; GFX9-DL-NEXT: v_perm_b32 v7, v8, v7, s0 -; GFX9-DL-NEXT: v_perm_b32 v8, v13, v12, s0 -; GFX9-DL-NEXT: v_perm_b32 v5, v6, v5, s0 +; GFX9-DL-NEXT: v_perm_b32 v7, v8, v7, s2 +; GFX9-DL-NEXT: v_perm_b32 v8, v13, v12, s2 +; GFX9-DL-NEXT: v_perm_b32 v5, v6, v5, s2 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 
12, v9 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v16 @@ -2291,11 +2299,11 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v17 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v8 -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v4, s0 -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v11, s0 -; GFX9-DL-NEXT: v_perm_b32 v4, v17, v16, s0 -; GFX9-DL-NEXT: v_perm_b32 v9, v10, v9, s0 -; GFX9-DL-NEXT: v_perm_b32 v10, v15, v14, s0 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v4, s2 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v11, s2 +; GFX9-DL-NEXT: v_perm_b32 v4, v17, v16, s2 +; GFX9-DL-NEXT: v_perm_b32 v9, v10, v9, s2 +; GFX9-DL-NEXT: v_perm_b32 v10, v15, v14, s2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v5, v3 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 @@ -2308,20 +2316,21 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_acc16_vecMul: ; GFX10-DL-XNACK: ; %bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-XNACK-NEXT: s_clause 0x1 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000 -; 
GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -2406,16 +2415,17 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX10-DL-NOXNACK-LABEL: idot8_acc16_vecMul: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -2537,13 +2547,13 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc8_vecMul: ; GFX7: ; 
%bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2604,11 +2614,11 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_acc8_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 12 -; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -2621,11 +2631,11 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s11, 0xe80000 -; GFX8-NEXT: s_add_u32 s8, s8, s3 -; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s14, -1 +; GFX8-NEXT: s_mov_b32 s15, 0xe80000 +; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 20, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 28, v3 @@ -2704,21 +2714,21 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) 
%src1, ; ; GFX9-LABEL: idot8_acc8_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 12 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 20, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 28, v1 @@ -2791,26 +2801,26 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_add_u16_e32 v1, v1, v5 ; GFX9-NEXT: v_mad_legacy_u16 v1, v15, v17, v1 ; GFX9-NEXT: v_add_u16_e32 v1, v1, v8 -; GFX9-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s10, -1 -; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 20, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1 @@ -2883,21 +2893,22 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v5 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v15, v17, v1 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v8 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_acc8_vecMul: ; GFX10-DL-XNACK: ; %bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-XNACK-NEXT: s_clause 0x1 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, 
SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -2988,16 +2999,17 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX10-DL-NOXNACK-LABEL: idot8_acc8_vecMul: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll index 3828fa557731e..779107cc40e1f 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -10,13 +10,13 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc32: ; GFX7: 
; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -61,11 +61,11 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -76,10 +76,10 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s11, 0xe80000 -; GFX8-NEXT: s_add_u32 s8, s8, s3 -; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: s_mov_b32 s14, -1 +; GFX8-NEXT: s_mov_b32 s15, 0xe80000 +; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3 ; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4 @@ -114,20 +114,20 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc32: ; GFX9: ; 
%bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_add_u32 s12, s12, s9 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 28, v1 ; GFX9-NEXT: v_bfe_u32 v4, v1, 24, 4 @@ -151,7 +151,7 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15 ; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add3_u32 v1, v1, s0, v2 +; GFX9-NEXT: v_add3_u32 v1, v1, s2, v2 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12 ; GFX9-NEXT: v_add3_u32 v1, v1, v8, v7 @@ -159,47 +159,48 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_u32_u24_e32 v3, v3, v10 ; GFX9-NEXT: v_add3_u32 v1, v1, v6, v5 ; GFX9-NEXT: v_add3_u32 v1, v1, v4, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc32: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; 
GFX9-DL-NEXT: s_mov_b32 s10, -1 -; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 +; 
GFX10-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] @@ -281,13 +282,13 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -332,11 +333,11 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -349,10 +350,10 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX8-NEXT: 
v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s11, 0xe80000 -; GFX8-NEXT: s_add_u32 s8, s8, s3 -; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: s_mov_b32 s14, -1 +; GFX8-NEXT: s_mov_b32 s15, 0xe80000 +; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 ; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4 @@ -385,20 +386,20 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4 @@ -426,25 +427,25 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; 
GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s10, -1 -; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4 @@ -472,27 +473,27 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ushort v4, v1, s[2:3] +; GFX10-DL-NEXT: global_load_ushort v4, v1, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -520,7 +521,7 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 -; GFX10-DL-NEXT: global_store_short v1, v0, s[2:3] +; GFX10-DL-NEXT: global_store_short v1, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -599,13 +600,13 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc8: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; 
GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -650,11 +651,11 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -667,10 +668,10 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s11, 0xe80000 -; GFX8-NEXT: s_add_u32 s8, s8, s3 -; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: s_mov_b32 s14, -1 +; GFX8-NEXT: s_mov_b32 s15, 0xe80000 +; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 ; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4 @@ -703,20 +704,20 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 
+; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4 @@ -744,25 +745,25 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s10, -1 -; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; 
GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4 @@ -790,27 +791,27 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -838,7 +839,7 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 -; 
GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX10-DL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -917,13 +918,13 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc4: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -969,11 +970,11 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc4: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -986,10 +987,10 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s11, 0xe80000 -; GFX8-NEXT: s_add_u32 s8, s8, s3 -; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: s_mov_b32 s14, -1 +; GFX8-NEXT: s_mov_b32 s15, 0xe80000 +; GFX8-NEXT: s_add_u32 s12, 
s12, s9 +; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 @@ -1023,20 +1024,20 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc4: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 @@ -1065,25 +1066,25 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc4: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s10, -1 -; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 -; 
GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 @@ -1112,27 +1113,27 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc4: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NEXT: s_mov_b32 s12, 
SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -1161,7 +1162,7 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX10-DL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1224,13 +1225,13 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_CommutationInsideMAD: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1276,11 +1277,11 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_CommutationInsideMAD: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1293,10 +1294,10 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s11, 0xe80000 -; GFX8-NEXT: s_add_u32 s8, s8, s3 -; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: s_mov_b32 s14, -1 +; GFX8-NEXT: s_mov_b32 s15, 0xe80000 +; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 @@ -1330,20 +1331,20 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_CommutationInsideMAD: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: 
global_load_ubyte v3, v0, s[2:3] +; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 @@ -1372,25 +1373,25 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_CommutationInsideMAD: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s10, -1 -; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 @@ -1419,27 +1420,27 @@ define amdgpu_kernel void 
@udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_CommutationInsideMAD: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -1468,7 +1469,7 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX10-DL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ptr 
addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1529,13 +1530,13 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_multiuses_mul1: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1582,11 +1583,11 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_multiuses_mul1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1597,10 +1598,10 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s11, 0xe80000 -; GFX8-NEXT: s_add_u32 s8, s8, s3 -; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: s_mov_b32 s14, -1 +; GFX8-NEXT: s_mov_b32 s15, 0xe80000 +; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; 
GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3 ; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4 @@ -1637,20 +1638,20 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_multiuses_mul1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_add_u32 s12, s12, s9 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_bfe_u32 v3, v1, 4, 4 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 @@ -1671,7 +1672,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: v_mul_u32_u24_e32 v17, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v1, v1, v2, s0 +; GFX9-NEXT: v_mad_u32_u24 v1, v1, v2, s2 ; GFX9-NEXT: v_mul_u32_u24_e32 v9, v9, v16 ; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15 ; GFX9-NEXT: v_mad_u32_u24 v2, v3, v10, v1 @@ -1683,25 +1684,25 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-NEXT: v_add3_u32 v2, v2, v7, v6 ; GFX9-NEXT: v_add3_u32 v2, v2, v5, v4 ; GFX9-NEXT: v_add3_u32 v1, v17, v1, v2 -; GFX9-NEXT: 
global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_multiuses_mul1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s10, -1 -; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_bfe_u32 v3, v1, 4, 4 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 @@ -1722,7 +1723,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v17, v1, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v1, v1, v2, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, v1, v2, s2 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v9, v9, v16 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v8, v8, v15 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v3, v10, v1 @@ -1734,20 +1735,21 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add3_u32 v2, v2, v7, v6 ; GFX9-DL-NEXT: v_add3_u32 v2, v2, v5, v4 ; GFX9-DL-NEXT: v_add3_u32 v1, v17, v1, v2 -; 
GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_multiuses_mul1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1865,13 +1867,13 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc32_vecMul: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1916,11 +1918,11 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc32_vecMul: ; GFX8: ; %bb.0: 
; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1931,10 +1933,10 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s11, 0xe80000 -; GFX8-NEXT: s_add_u32 s8, s8, s3 -; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: s_mov_b32 s14, -1 +; GFX8-NEXT: s_mov_b32 s15, 0xe80000 +; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3 ; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4 @@ -1969,20 +1971,20 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc32_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: 
global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_add_u32 s12, s12, s9 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 28, v1 ; GFX9-NEXT: v_bfe_u32 v4, v1, 24, 4 @@ -2006,7 +2008,7 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15 ; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add3_u32 v1, v1, s0, v2 +; GFX9-NEXT: v_add3_u32 v1, v1, s2, v2 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12 ; GFX9-NEXT: v_add3_u32 v1, v1, v8, v7 @@ -2014,47 +2016,48 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_u32_u24_e32 v3, v3, v10 ; GFX9-NEXT: v_add3_u32 v1, v1, v6, v5 ; GFX9-NEXT: v_add3_u32 v1, v1, v4, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s10, -1 -; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: 
s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] @@ -2101,13 +2104,13 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc16_vecMul: ; GFX7: ; 
%bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2152,11 +2155,11 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -2169,10 +2172,10 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s11, 0xe80000 -; GFX8-NEXT: s_add_u32 s8, s8, s3 -; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: s_mov_b32 s14, -1 +; GFX8-NEXT: s_mov_b32 s15, 0xe80000 +; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 ; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4 @@ -2205,21 +2208,21 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX9-LABEL: 
udot8_acc16_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_mov_b32 s2, 0x5040100 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_and_b32_e32 v4, 15, v1 ; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4 @@ -2228,16 +2231,16 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_and_b32_e32 v11, 15, v2 ; GFX9-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-NEXT: v_perm_b32 v6, v7, v6, s0 -; GFX9-NEXT: v_perm_b32 v7, v12, v11, s0 -; GFX9-NEXT: v_perm_b32 v4, v5, v4, s0 +; GFX9-NEXT: v_perm_b32 v6, v7, v6, s2 +; GFX9-NEXT: v_perm_b32 v7, v12, v11, s2 +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s2 ; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 4 ; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4 ; GFX9-NEXT: v_bfe_u32 v13, v2, 8, 4 ; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-NEXT: v_perm_b32 v8, v9, v8, s0 -; GFX9-NEXT: v_perm_b32 v9, v14, v13, s0 +; GFX9-NEXT: v_perm_b32 v8, v9, v8, s2 +; GFX9-NEXT: v_perm_b32 v9, v14, v13, 
s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v3, v4, v3 ; GFX9-NEXT: v_bfe_u32 v10, v1, 24, 4 @@ -2248,9 +2251,9 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_perm_b32 v2, v2, v17, s0 -; GFX9-NEXT: v_perm_b32 v1, v1, v10, s0 -; GFX9-NEXT: v_perm_b32 v10, v16, v15, s0 +; GFX9-NEXT: v_perm_b32 v2, v2, v17, s2 +; GFX9-NEXT: v_perm_b32 v1, v1, v10, s2 +; GFX9-NEXT: v_perm_b32 v10, v16, v15, s2 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10 @@ -2259,26 +2262,26 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s10, -1 -; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-DL-NEXT: s_mov_b32 s2, 0x5040100 +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: 
global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v1 ; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 4, 4 @@ -2287,16 +2290,16 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v2 ; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s0 -; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s0 -; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s0 +; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s2 +; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s2 +; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s2 ; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 16, 4 ; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4 ; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 ; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s0 -; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s0 +; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s2 +; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 ; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 24, 4 @@ -2307,9 +2310,9 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s0 -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v10, s0 -; GFX9-DL-NEXT: v_perm_b32 v10, v16, v15, s0 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s2 +; GFX9-DL-NEXT: v_perm_b32 
v1, v1, v10, s2 +; GFX9-DL-NEXT: v_perm_b32 v10, v16, v15, s2 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10 @@ -2318,20 +2321,21 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -2424,13 +2428,13 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc8_vecMul: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: 
s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2475,11 +2479,11 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc8_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -2492,10 +2496,10 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s11, 0xe80000 -; GFX8-NEXT: s_add_u32 s8, s8, s3 -; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: s_mov_b32 s14, -1 +; GFX8-NEXT: s_mov_b32 s15, 0xe80000 +; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v3 ; GFX8-NEXT: v_bfe_u32 v10, v3, 24, 4 @@ -2548,20 +2552,20 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc8_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: 
s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: global_load_ubyte v4, v3, s[2:3] -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: global_load_ubyte v4, v3, s[0:1] +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_bfe_u32 v0, v1, 4, 4 ; GFX9-NEXT: v_and_b32_e32 v5, 15, v1 @@ -2608,25 +2612,25 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_add_u16_e32 v0, v0, v8 ; GFX9-NEXT: v_mad_legacy_u16 v0, v9, v16, v0 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v7 -; GFX9-NEXT: global_store_byte v3, v0, s[2:3] +; GFX9-NEXT: global_store_byte v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s10, -1 -; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: 
global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[2:3] -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[0:1] +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_bfe_u32 v0, v1, 4, 4 ; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v1 @@ -2673,21 +2677,22 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v8 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v9, v16, v0 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v7 -; GFX9-DL-NEXT: global_store_byte v3, v0, s[2:3] +; GFX9-DL-NEXT: global_store_byte v3, v0, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -2786,13 +2791,13 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: 
udot8_acc4_vecMul: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2838,11 +2843,11 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc4_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -2855,10 +2860,10 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s11, 0xe80000 -; GFX8-NEXT: s_add_u32 s8, s8, s3 -; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: s_mov_b32 s14, -1 +; GFX8-NEXT: s_mov_b32 s15, 0xe80000 +; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 @@ -2892,21 +2897,21 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) 
%src1, ; ; GFX9-LABEL: udot8_acc4_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_mov_b32 s2, 0x5040100 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_and_b32_e32 v4, 15, v1 ; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4 @@ -2915,16 +2920,16 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_and_b32_e32 v11, 15, v2 ; GFX9-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-NEXT: v_perm_b32 v6, v7, v6, s0 -; GFX9-NEXT: v_perm_b32 v7, v12, v11, s0 -; GFX9-NEXT: v_perm_b32 v4, v5, v4, s0 +; GFX9-NEXT: v_perm_b32 v6, v7, v6, s2 +; GFX9-NEXT: v_perm_b32 v7, v12, v11, s2 +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s2 ; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 4 ; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4 ; GFX9-NEXT: v_bfe_u32 v13, v2, 8, 4 ; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-NEXT: v_perm_b32 v8, v9, v8, s0 -; GFX9-NEXT: v_perm_b32 v9, v14, v13, s0 +; GFX9-NEXT: v_perm_b32 v8, v9, v8, s2 +; GFX9-NEXT: 
v_perm_b32 v9, v14, v13, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v3, v4, v3 ; GFX9-NEXT: v_bfe_u32 v10, v1, 24, 4 @@ -2935,9 +2940,9 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_perm_b32 v2, v2, v17, s0 -; GFX9-NEXT: v_perm_b32 v1, v1, v10, s0 -; GFX9-NEXT: v_perm_b32 v10, v16, v15, s0 +; GFX9-NEXT: v_perm_b32 v2, v2, v17, s2 +; GFX9-NEXT: v_perm_b32 v1, v1, v10, s2 +; GFX9-NEXT: v_perm_b32 v10, v16, v15, s2 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10 @@ -2947,26 +2952,26 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc4_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s10, -1 -; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-DL-NEXT: s_mov_b32 s2, 0x5040100 +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: 
v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v1 ; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 4, 4 @@ -2975,16 +2980,16 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v2 ; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s0 -; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s0 -; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s0 +; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s2 +; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s2 +; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s2 ; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 16, 4 ; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4 ; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 ; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s0 -; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s0 +; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s2 +; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 ; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 24, 4 @@ -2995,9 +3000,9 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s0 -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v10, s0 -; GFX9-DL-NEXT: v_perm_b32 v10, v16, v15, s0 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s2 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v10, s2 +; GFX9-DL-NEXT: v_perm_b32 v10, v16, 
v15, s2 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10 @@ -3007,20 +3012,21 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc4_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -3109,8 +3115,8 @@ entry: define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr, ; GFX7-LABEL: udot8_variant1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 
@@ -3155,8 +3161,8 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr, ; ; GFX8-LABEL: udot8_variant1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -3202,13 +3208,13 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr, ; ; GFX9-LABEL: udot8_variant1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_and_b32_e32 v3, 15, v1 @@ -3233,7 +3239,7 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr, ; GFX9-NEXT: v_mul_u32_u24_e32 v4, v6, v5 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, v8, v7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add3_u32 v1, v3, s0, v1 +; GFX9-NEXT: v_add3_u32 v1, v3, s2, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, v10, v9 ; GFX9-NEXT: v_mul_u32_u24_e32 v7, v12, v11 ; GFX9-NEXT: v_add3_u32 v1, v1, v4, v5 @@ -3241,35 +3247,36 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr, ; GFX9-NEXT: v_mul_u32_u24_e32 v9, v16, v15 ; GFX9-NEXT: v_add3_u32 v1, v1, v6, v7 ; GFX9-NEXT: v_add3_u32 v1, v1, v8, v9 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_variant1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_variant1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/imm.ll b/llvm/test/CodeGen/AMDGPU/imm.ll index f7a0e296fa173..0f40d010e2a3a 100644 --- a/llvm/test/CodeGen/AMDGPU/imm.ll +++ b/llvm/test/CodeGen/AMDGPU/imm.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @i64_imm_inline_lo(ptr addrspace(1) %out) { ; SI-LABEL: i64_imm_inline_lo: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; 
SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 5 @@ -17,7 +17,7 @@ define amdgpu_kernel void @i64_imm_inline_lo(ptr addrspace(1) %out) { ; ; VI-LABEL: i64_imm_inline_lo: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 5 @@ -34,7 +34,7 @@ entry: define amdgpu_kernel void @i64_imm_inline_hi(ptr addrspace(1) %out) { ; SI-LABEL: i64_imm_inline_hi: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x12345678 @@ -45,7 +45,7 @@ define amdgpu_kernel void @i64_imm_inline_hi(ptr addrspace(1) %out) { ; ; VI-LABEL: i64_imm_inline_hi: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x12345678 @@ -61,7 +61,7 @@ entry: define amdgpu_kernel void @store_imm_neg_0.0_i64(ptr addrspace(1) %out) { ; SI-LABEL: store_imm_neg_0.0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -72,7 +72,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_i64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_imm_neg_0.0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -87,7 +87,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_i64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_neg_0.0_i32: ; SI: ; 
%bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_bfrev_b32_e32 v0, 1 @@ -97,7 +97,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_neg_0.0_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_bfrev_b32_e32 v0, 1 @@ -111,7 +111,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_0.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_0.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -121,7 +121,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_0.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -135,7 +135,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_imm_neg_0.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_imm_neg_0.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_bfrev_b32_e32 v0, 1 @@ -145,7 +145,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_imm_neg_0.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 
s2, -1 ; VI-NEXT: v_bfrev_b32_e32 v0, 1 @@ -159,7 +159,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_0.5_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_0.5_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0.5 @@ -169,7 +169,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_0.5_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0.5 @@ -183,7 +183,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_0.5_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_0.5_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -0.5 @@ -193,7 +193,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_0.5_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -0.5 @@ -207,7 +207,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_1.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_1.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1.0 @@ -217,7 +217,7 @@ define 
amdgpu_kernel void @store_inline_imm_1.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_1.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1.0 @@ -231,7 +231,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_1.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_1.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -1.0 @@ -241,7 +241,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_1.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -1.0 @@ -255,7 +255,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_2.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_2.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 2.0 @@ -265,7 +265,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_2.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 2.0 @@ -279,7 +279,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_2.0_f32(ptr 
addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_2.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -2.0 @@ -289,7 +289,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_2.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -2.0 @@ -303,7 +303,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_4.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_4.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 4.0 @@ -313,7 +313,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_4.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 4.0 @@ -327,7 +327,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_4.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_4.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -4.0 @@ -337,7 +337,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_4.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; 
VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -4.0 @@ -351,7 +351,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_inv_2pi_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e22f983 @@ -361,7 +361,7 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_inv_2pi_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0.15915494 @@ -375,7 +375,7 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_inv_2pi_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0xbe22f983 @@ -385,7 +385,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(ptr addrspace(1) %out) ; ; VI-LABEL: store_inline_imm_m_inv_2pi_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0xbe22f983 @@ -399,7 +399,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(ptr addrspace(1) %out) define amdgpu_kernel void @store_literal_imm_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_literal_imm_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; 
SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x45800000 @@ -409,7 +409,7 @@ define amdgpu_kernel void @store_literal_imm_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_literal_imm_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x45800000 @@ -423,8 +423,8 @@ define amdgpu_kernel void @store_literal_imm_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @add_inline_imm_0.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_0.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -434,8 +434,8 @@ define amdgpu_kernel void @add_inline_imm_0.0_f32(ptr addrspace(1) %out, float % ; ; VI-LABEL: add_inline_imm_0.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -450,8 +450,8 @@ define amdgpu_kernel void @add_inline_imm_0.0_f32(ptr addrspace(1) %out, float % define amdgpu_kernel void @add_inline_imm_0.5_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_0.5_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -461,8 +461,8 @@ define amdgpu_kernel void @add_inline_imm_0.5_f32(ptr 
addrspace(1) %out, float % ; ; VI-LABEL: add_inline_imm_0.5_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -477,8 +477,8 @@ define amdgpu_kernel void @add_inline_imm_0.5_f32(ptr addrspace(1) %out, float % define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_0.5_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -488,8 +488,8 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(ptr addrspace(1) %out, flo ; ; VI-LABEL: add_inline_imm_neg_0.5_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -504,8 +504,8 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @add_inline_imm_1.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_1.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -515,8 +515,8 @@ define amdgpu_kernel void @add_inline_imm_1.0_f32(ptr addrspace(1) %out, float % ; ; VI-LABEL: add_inline_imm_1.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; 
VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -531,8 +531,8 @@ define amdgpu_kernel void @add_inline_imm_1.0_f32(ptr addrspace(1) %out, float % define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_1.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -542,8 +542,8 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(ptr addrspace(1) %out, flo ; ; VI-LABEL: add_inline_imm_neg_1.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -558,8 +558,8 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @add_inline_imm_2.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_2.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -569,8 +569,8 @@ define amdgpu_kernel void @add_inline_imm_2.0_f32(ptr addrspace(1) %out, float % ; ; VI-LABEL: add_inline_imm_2.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -585,8 +585,8 @@ define amdgpu_kernel void @add_inline_imm_2.0_f32(ptr addrspace(1) %out, float % define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_2.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -596,8 +596,8 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(ptr addrspace(1) %out, flo ; ; VI-LABEL: add_inline_imm_neg_2.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -612,8 +612,8 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @add_inline_imm_4.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_4.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -623,8 +623,8 @@ define amdgpu_kernel void @add_inline_imm_4.0_f32(ptr addrspace(1) %out, float % ; ; VI-LABEL: add_inline_imm_4.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -639,8 +639,8 @@ 
define amdgpu_kernel void @add_inline_imm_4.0_f32(ptr addrspace(1) %out, float % define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_4.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -650,8 +650,8 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(ptr addrspace(1) %out, flo ; ; VI-LABEL: add_inline_imm_neg_4.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -666,7 +666,7 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: commute_add_inline_imm_0.5_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -684,7 +684,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(ptr addrspace(1) %out, ; ; VI-LABEL: commute_add_inline_imm_0.5_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -708,7 +708,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(ptr addrspace(1) %out, define amdgpu_kernel void @commute_add_literal_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: commute_add_literal_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -726,7 +726,7 @@ define amdgpu_kernel void @commute_add_literal_f32(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: commute_add_literal_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -750,8 +750,8 @@ define amdgpu_kernel void @commute_add_literal_f32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @add_inline_imm_1_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_1_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -761,8 +761,8 @@ define amdgpu_kernel void @add_inline_imm_1_f32(ptr addrspace(1) %out, float %x) ; ; VI-LABEL: add_inline_imm_1_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -777,8 +777,8 @@ define amdgpu_kernel void @add_inline_imm_1_f32(ptr addrspace(1) %out, float %x) define amdgpu_kernel void @add_inline_imm_2_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_2_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -788,8 +788,8 @@ define amdgpu_kernel void @add_inline_imm_2_f32(ptr 
addrspace(1) %out, float %x) ; ; VI-LABEL: add_inline_imm_2_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -804,8 +804,8 @@ define amdgpu_kernel void @add_inline_imm_2_f32(ptr addrspace(1) %out, float %x) define amdgpu_kernel void @add_inline_imm_16_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -815,8 +815,8 @@ define amdgpu_kernel void @add_inline_imm_16_f32(ptr addrspace(1) %out, float %x ; ; VI-LABEL: add_inline_imm_16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -831,8 +831,8 @@ define amdgpu_kernel void @add_inline_imm_16_f32(ptr addrspace(1) %out, float %x define amdgpu_kernel void @add_inline_imm_neg_1_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_1_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -843,8 +843,8 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f32(ptr addrspace(1) %out, float ; ; VI-LABEL: add_inline_imm_neg_1_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -862,8 +862,8 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f32(ptr addrspace(1) %out, float define amdgpu_kernel void @add_inline_imm_neg_2_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_2_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -874,8 +874,8 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f32(ptr addrspace(1) %out, float ; ; VI-LABEL: add_inline_imm_neg_2_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -893,8 +893,8 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f32(ptr addrspace(1) %out, float define amdgpu_kernel void @add_inline_imm_neg_16_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -905,8 +905,8 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f32(ptr addrspace(1) %out, floa ; ; VI-LABEL: add_inline_imm_neg_16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -924,8 +924,8 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f32(ptr addrspace(1) %out, floa define amdgpu_kernel void @add_inline_imm_63_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_63_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -935,8 +935,8 @@ define amdgpu_kernel void @add_inline_imm_63_f32(ptr addrspace(1) %out, float %x ; ; VI-LABEL: add_inline_imm_63_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -951,8 +951,8 @@ define amdgpu_kernel void @add_inline_imm_63_f32(ptr addrspace(1) %out, float %x define amdgpu_kernel void @add_inline_imm_64_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_64_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -962,8 +962,8 @@ define amdgpu_kernel void @add_inline_imm_64_f32(ptr addrspace(1) %out, float %x ; ; VI-LABEL: add_inline_imm_64_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -978,23 +978,25 @@ define amdgpu_kernel 
void @add_inline_imm_64_f32(ptr addrspace(1) %out, float %x define amdgpu_kernel void @add_inline_imm_0.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_0.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[2:3], 0 +; SI-NEXT: v_add_f64 v[0:1], s[0:1], 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_0.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[2:3], 0 +; VI-NEXT: v_add_f64 v[0:1], s[0:1], 0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0.0 @@ -1005,23 +1007,25 @@ define amdgpu_kernel void @add_inline_imm_0.0_f64(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @add_inline_imm_0.5_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_0.5_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[2:3], 0.5 +; SI-NEXT: v_add_f64 v[0:1], s[0:1], 0.5 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_0.5_f64: ; VI: 
; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[2:3], 0.5 +; VI-NEXT: v_add_f64 v[0:1], s[0:1], 0.5 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0.5 @@ -1032,23 +1036,25 @@ define amdgpu_kernel void @add_inline_imm_0.5_f64(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_0.5_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[2:3], -0.5 +; SI-NEXT: v_add_f64 v[0:1], s[0:1], -0.5 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_0.5_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[2:3], -0.5 +; VI-NEXT: v_add_f64 v[0:1], s[0:1], -0.5 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, -0.5 @@ -1059,23 +1065,25 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(ptr addrspace(1) %out, [8 define amdgpu_kernel void @add_inline_imm_1.0_f64(ptr addrspace(1) %out, [8 x i32], 
double %x) { ; SI-LABEL: add_inline_imm_1.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_1.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 1.0 @@ -1086,23 +1094,25 @@ define amdgpu_kernel void @add_inline_imm_1.0_f64(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_1.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[2:3], -1.0 +; SI-NEXT: v_add_f64 v[0:1], s[0:1], -1.0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_1.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x4c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[2:3], -1.0 +; VI-NEXT: v_add_f64 v[0:1], s[0:1], -1.0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, -1.0 @@ -1113,23 +1123,25 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(ptr addrspace(1) %out, [8 define amdgpu_kernel void @add_inline_imm_2.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_2.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[2:3], 2.0 +; SI-NEXT: v_add_f64 v[0:1], s[0:1], 2.0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_2.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[2:3], 2.0 +; VI-NEXT: v_add_f64 v[0:1], s[0:1], 2.0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 2.0 @@ -1140,23 +1152,25 @@ define amdgpu_kernel void @add_inline_imm_2.0_f64(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_2.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 -; SI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[2:3], -2.0 +; SI-NEXT: v_add_f64 v[0:1], s[0:1], -2.0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_2.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[2:3], -2.0 +; VI-NEXT: v_add_f64 v[0:1], s[0:1], -2.0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, -2.0 @@ -1167,23 +1181,25 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(ptr addrspace(1) %out, [8 define amdgpu_kernel void @add_inline_imm_4.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_4.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[2:3], 4.0 +; SI-NEXT: v_add_f64 v[0:1], s[0:1], 4.0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_4.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[2:3], 4.0 +; VI-NEXT: v_add_f64 
v[0:1], s[0:1], 4.0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 4.0 @@ -1194,23 +1210,25 @@ define amdgpu_kernel void @add_inline_imm_4.0_f64(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_4.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[2:3], -4.0 +; SI-NEXT: v_add_f64 v[0:1], s[0:1], -4.0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_4.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[2:3], -4.0 +; VI-NEXT: v_add_f64 v[0:1], s[0:1], -4.0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, -4.0 @@ -1221,25 +1239,27 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(ptr addrspace(1) %out, [8 define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_inv_2pi_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 ; SI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 ; SI-NEXT: v_mov_b32_e32 v1, 0x3fc45f30 -; 
SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; SI-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_inv_2pi_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[2:3], 0.15915494309189532 +; VI-NEXT: v_add_f64 v[0:1], s[0:1], 0.15915494309189532 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x3fc45f306dc9c882 @@ -1250,27 +1270,29 @@ define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(ptr addrspace(1) %out, [8 define amdgpu_kernel void @add_m_inv_2pi_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_m_inv_2pi_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 ; SI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 ; SI-NEXT: v_mov_b32_e32 v1, 0xbfc45f30 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; SI-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_m_inv_2pi_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c ; VI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 ; VI-NEXT: v_mov_b32_e32 
v1, 0xbfc45f30 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; VI-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0xbfc45f306dc9c882 @@ -1281,23 +1303,25 @@ define amdgpu_kernel void @add_m_inv_2pi_f64(ptr addrspace(1) %out, [8 x i32], d define amdgpu_kernel void @add_inline_imm_1_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_1_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[2:3], 1 +; SI-NEXT: v_add_f64 v[0:1], s[0:1], 1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_1_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1 +; VI-NEXT: v_add_f64 v[0:1], s[0:1], 1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x0000000000000001 @@ -1308,23 +1332,25 @@ define amdgpu_kernel void @add_inline_imm_1_f64(ptr addrspace(1) %out, [8 x i32] define amdgpu_kernel void @add_inline_imm_2_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_2_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[2:3], 2 +; SI-NEXT: v_add_f64 v[0:1], s[0:1], 2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_2_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[2:3], 2 +; VI-NEXT: v_add_f64 v[0:1], s[0:1], 2 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x0000000000000002 @@ -1335,23 +1361,25 @@ define amdgpu_kernel void @add_inline_imm_2_f64(ptr addrspace(1) %out, [8 x i32] define amdgpu_kernel void @add_inline_imm_16_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_16_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[2:3], 16 +; SI-NEXT: v_add_f64 v[0:1], s[0:1], 16 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_16_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[2:3], 16 
+; VI-NEXT: v_add_f64 v[0:1], s[0:1], 16 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x0000000000000010 @@ -1362,7 +1390,7 @@ define amdgpu_kernel void @add_inline_imm_16_f64(ptr addrspace(1) %out, [8 x i32 define amdgpu_kernel void @add_inline_imm_neg_1_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_1_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, -1 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -1373,7 +1401,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f64(ptr addrspace(1) %out, [8 x ; ; VI-LABEL: add_inline_imm_neg_1_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, -1 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -1389,7 +1417,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f64(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @add_inline_imm_neg_2_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_2_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -2 @@ -1400,7 +1428,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f64(ptr addrspace(1) %out, [8 x ; ; VI-LABEL: add_inline_imm_neg_2_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -2 @@ -1416,7 +1444,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f64(ptr addrspace(1) %out, [8 x define amdgpu_kernel void 
@add_inline_imm_neg_16_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_16_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -16 @@ -1427,7 +1455,7 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f64(ptr addrspace(1) %out, [8 x ; ; VI-LABEL: add_inline_imm_neg_16_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -16 @@ -1443,23 +1471,25 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f64(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @add_inline_imm_63_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_63_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[2:3], 63 +; SI-NEXT: v_add_f64 v[0:1], s[0:1], 63 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_63_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[2:3], 63 +; VI-NEXT: v_add_f64 v[0:1], s[0:1], 63 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x000000000000003F @@ -1470,23 +1500,25 @@ define amdgpu_kernel 
void @add_inline_imm_63_f64(ptr addrspace(1) %out, [8 x i32 define amdgpu_kernel void @add_inline_imm_64_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_64_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[2:3], 64 +; SI-NEXT: v_add_f64 v[0:1], s[0:1], 64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_64_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[2:3], 64 +; VI-NEXT: v_add_f64 v[0:1], s[0:1], 64 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x0000000000000040 @@ -1497,7 +1529,7 @@ define amdgpu_kernel void @add_inline_imm_64_f64(ptr addrspace(1) %out, [8 x i32 define amdgpu_kernel void @store_inline_imm_0.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_0.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -1508,7 +1540,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_0.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, 
-1 @@ -1523,7 +1555,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_literal_imm_neg_0.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1534,7 +1566,7 @@ define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(ptr addrspace(1) %out) ; ; VI-LABEL: store_literal_imm_neg_0.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1549,7 +1581,7 @@ define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(ptr addrspace(1) %out) define amdgpu_kernel void @store_inline_imm_0.5_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_0.5_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1560,7 +1592,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_0.5_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1575,7 +1607,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_0.5_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_0.5_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1586,7 +1618,7 @@ define amdgpu_kernel void 
@store_inline_imm_m_0.5_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_0.5_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1601,7 +1633,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_1.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_1.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1612,7 +1644,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_1.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1627,7 +1659,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_1.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_1.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1638,7 +1670,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_1.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1653,7 +1685,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_2.0_f64(ptr addrspace(1) %out) { ; 
SI-LABEL: store_inline_imm_2.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1664,7 +1696,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_2.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1679,7 +1711,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_2.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_2.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1690,7 +1722,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_2.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1705,7 +1737,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_4.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_4.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1716,7 +1748,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_4.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1731,7 +1763,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_4.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_4.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1742,7 +1774,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_4.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1757,7 +1789,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inv_2pi_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inv_2pi_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 @@ -1768,7 +1800,7 @@ define amdgpu_kernel void @store_inv_2pi_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inv_2pi_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 @@ -1783,7 +1815,7 @@ define amdgpu_kernel void @store_inv_2pi_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_inv_2pi_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 
-1 ; SI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 @@ -1794,7 +1826,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(ptr addrspace(1) %out) ; ; VI-LABEL: store_inline_imm_m_inv_2pi_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 @@ -1809,7 +1841,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(ptr addrspace(1) %out) define amdgpu_kernel void @store_literal_imm_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_literal_imm_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1820,7 +1852,7 @@ define amdgpu_kernel void @store_literal_imm_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_literal_imm_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll index dcc615232e56b..f407a1c26dd3e 100644 --- a/llvm/test/CodeGen/AMDGPU/imm16.ll +++ b/llvm/test/CodeGen/AMDGPU/imm16.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_neg_0.0_i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -20,7 +20,7 @@ 
define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_neg_0.0_i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -33,7 +33,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_neg_0.0_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff] @@ -44,7 +44,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_neg_0.0_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x8000 @@ -59,7 +59,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_0.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_0.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; 
GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -69,7 +69,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_0.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -81,7 +81,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_0.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] @@ -91,7 +91,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_0.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -105,7 +105,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_imm_neg_0.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_imm_neg_0.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: 
[0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -115,7 +115,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_imm_neg_0.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -127,7 +127,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_imm_neg_0.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff] @@ -137,7 +137,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_imm_neg_0.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x8000 @@ -151,7 +151,7 @@ define amdgpu_kernel void 
@store_imm_neg_0.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_0.5_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_0.5_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0x3800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x38,0x00,0x00] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -161,7 +161,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_0.5_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x3800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x38,0x00,0x00] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -173,7 +173,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_0.5_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0x3800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x38,0x00,0x00] @@ -183,7 +183,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_0.5_f16: ; SI: ; %bb.0: 
-; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3800 @@ -197,7 +197,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_0.5_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_m_0.5_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffb800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xb8,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -207,7 +207,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_m_0.5_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffb800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xb8,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -219,7 +219,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_0.5_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: 
[0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffffb800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xb8,0xff,0xff] @@ -229,7 +229,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_m_0.5_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0xb800 @@ -243,7 +243,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_1.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_1.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0x3c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x3c,0x00,0x00] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -253,7 +253,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_1.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x3c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x3c,0x00,0x00] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -265,7 +265,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_1.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: 
[0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0x3c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x3c,0x00,0x00] @@ -275,7 +275,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_1.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3c00 @@ -289,7 +289,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_1.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_m_1.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffbc00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xbc,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -299,7 +299,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_m_1.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffbc00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xbc,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: 
s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -311,7 +311,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_1.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffffbc00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xbc,0xff,0xff] @@ -321,7 +321,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_m_1.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0xbc00 @@ -335,7 +335,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_2.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_2.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0x4000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x40,0x00,0x00] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -345,7 +345,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_2.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 
; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x4000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x40,0x00,0x00] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -357,7 +357,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_2.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0x4000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x40,0x00,0x00] @@ -367,7 +367,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_2.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x4000 @@ -381,7 +381,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_2.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_m_2.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffc000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc0,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -391,7 +391,7 @@ define amdgpu_kernel void 
@store_inline_imm_m_2.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_m_2.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffc000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc0,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -403,7 +403,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_2.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffffc000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc0,0xff,0xff] @@ -413,7 +413,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_m_2.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0xc000 @@ -427,7 +427,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_4.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_4.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 
0x4400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x44,0x00,0x00] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -437,7 +437,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_4.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x4400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x44,0x00,0x00] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -449,7 +449,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_4.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0x4400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x44,0x00,0x00] @@ -459,7 +459,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_4.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x4400 @@ -473,7 +473,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_4.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_m_4.0_f16: ; GFX10: ; 
%bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffc400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc4,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -483,7 +483,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_m_4.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffc400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc4,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -495,7 +495,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_4.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffffc400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc4,0xff,0xff] @@ -505,7 +505,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_m_4.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 
-1 ; SI-NEXT: v_mov_b32_e32 v0, 0xc400 @@ -519,7 +519,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_inv_2pi_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0x3118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0x31,0x00,0x00] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -529,7 +529,7 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_inv_2pi_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x3118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0x31,0x00,0x00] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -541,7 +541,7 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_inv_2pi_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0x3118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0x31,0x00,0x00] @@ -551,7 +551,7 @@ define 
amdgpu_kernel void @store_inline_imm_inv_2pi_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_inv_2pi_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3118 @@ -565,7 +565,7 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_m_inv_2pi_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffb118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0xb1,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -575,7 +575,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(ptr addrspace(1) %out) ; ; GFX11-LABEL: store_inline_imm_m_inv_2pi_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffb118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0xb1,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -587,7 +587,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(ptr addrspace(1) %out) ; ; VI-LABEL: store_inline_imm_m_inv_2pi_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: 
[0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffffb118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0xb1,0xff,0xff] @@ -597,7 +597,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(ptr addrspace(1) %out) ; ; SI-LABEL: store_inline_imm_m_inv_2pi_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0xb118 @@ -611,7 +611,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(ptr addrspace(1) %out) define amdgpu_kernel void @store_literal_imm_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_literal_imm_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0x6c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x6c,0x00,0x00] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -621,7 +621,7 @@ define amdgpu_kernel void @store_literal_imm_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_literal_imm_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x6c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x6c,0x00,0x00] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -633,7 +633,7 @@ define amdgpu_kernel void 
@store_literal_imm_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_literal_imm_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0x6c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x6c,0x00,0x00] @@ -643,7 +643,7 @@ define amdgpu_kernel void @store_literal_imm_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_literal_imm_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x6c00 @@ -658,8 +658,8 @@ define amdgpu_kernel void @add_inline_imm_0.0_f16(ptr addrspace(1) %out, half %x ; GFX10-LABEL: add_inline_imm_0.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x00,0x01,0x00] @@ -670,12 +670,12 @@ define amdgpu_kernel void @add_inline_imm_0.0_f16(ptr addrspace(1) %out, half %x ; GFX11-LABEL: add_inline_imm_0.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: 
s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s2, 0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x00,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s4, 0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x00,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -683,25 +683,26 @@ define amdgpu_kernel void @add_inline_imm_0.0_f16(ptr addrspace(1) %out, half %x ; ; VI-LABEL: add_inline_imm_0.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s6, 0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x00,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s4, 0 ; 
encoding: [0x00,0x00,0x1f,0xd1,0x04,0x00,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_0.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 0.0 @@ -713,8 +714,8 @@ define amdgpu_kernel void @add_inline_imm_0.5_f16(ptr addrspace(1) %out, half %x ; GFX10-LABEL: add_inline_imm_0.5_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe0,0x01,0x00] @@ -725,12 +726,12 @@ define amdgpu_kernel void @add_inline_imm_0.5_f16(ptr addrspace(1) %out, half %x ; GFX11-LABEL: add_inline_imm_0.5_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: 
[0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s2, 0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe0,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s4, 0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe0,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -738,25 +739,26 @@ define amdgpu_kernel void @add_inline_imm_0.5_f16(ptr addrspace(1) %out, half %x ; ; VI-LABEL: add_inline_imm_0.5_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s6, 0.5 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xe0,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s4, 0.5 ; encoding: 
[0x00,0x00,0x1f,0xd1,0x04,0xe0,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_0.5_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 0.5, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 0.5 @@ -768,8 +770,8 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(ptr addrspace(1) %out, hal ; GFX10-LABEL: add_inline_imm_neg_0.5_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, -0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe2,0x01,0x00] @@ -780,12 +782,12 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(ptr addrspace(1) %out, hal ; GFX11-LABEL: add_inline_imm_neg_0.5_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: 
[0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s2, -0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe2,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s4, -0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe2,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -793,25 +795,26 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(ptr addrspace(1) %out, hal ; ; VI-LABEL: add_inline_imm_neg_0.5_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s6, -0.5 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xe2,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s4, -0.5 ; encoding: 
[0x00,0x00,0x1f,0xd1,0x04,0xe2,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_neg_0.5_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, -0.5, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, -0.5 @@ -823,8 +826,8 @@ define amdgpu_kernel void @add_inline_imm_1.0_f16(ptr addrspace(1) %out, half %x ; GFX10-LABEL: add_inline_imm_1.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe4,0x01,0x00] @@ -835,12 +838,12 @@ define amdgpu_kernel void @add_inline_imm_1.0_f16(ptr addrspace(1) %out, half %x ; GFX11-LABEL: add_inline_imm_1.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: 
[0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s2, 1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe4,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s4, 1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe4,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -848,25 +851,26 @@ define amdgpu_kernel void @add_inline_imm_1.0_f16(ptr addrspace(1) %out, half %x ; ; VI-LABEL: add_inline_imm_1.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s6, 1.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xe4,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s4, 1.0 ; encoding: 
[0x00,0x00,0x1f,0xd1,0x04,0xe4,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_1.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 1.0 @@ -878,8 +882,8 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(ptr addrspace(1) %out, hal ; GFX10-LABEL: add_inline_imm_neg_1.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, -1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe6,0x01,0x00] @@ -890,12 +894,12 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(ptr addrspace(1) %out, hal ; GFX11-LABEL: add_inline_imm_neg_1.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: 
[0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s2, -1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe6,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s4, -1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe6,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -903,25 +907,26 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(ptr addrspace(1) %out, hal ; ; VI-LABEL: add_inline_imm_neg_1.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s6, -1.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xe6,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s4, -1.0 ; encoding: 
[0x00,0x00,0x1f,0xd1,0x04,0xe6,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_neg_1.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, -1.0 @@ -933,8 +938,8 @@ define amdgpu_kernel void @add_inline_imm_2.0_f16(ptr addrspace(1) %out, half %x ; GFX10-LABEL: add_inline_imm_2.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe8,0x01,0x00] @@ -945,12 +950,12 @@ define amdgpu_kernel void @add_inline_imm_2.0_f16(ptr addrspace(1) %out, half %x ; GFX11-LABEL: add_inline_imm_2.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: 
[0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s2, 2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe8,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s4, 2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe8,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -958,25 +963,26 @@ define amdgpu_kernel void @add_inline_imm_2.0_f16(ptr addrspace(1) %out, half %x ; ; VI-LABEL: add_inline_imm_2.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s6, 2.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xe8,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s4, 2.0 ; encoding: 
[0x00,0x00,0x1f,0xd1,0x04,0xe8,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_2.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 2.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 2.0 @@ -988,8 +994,8 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(ptr addrspace(1) %out, hal ; GFX10-LABEL: add_inline_imm_neg_2.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, -2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xea,0x01,0x00] @@ -1000,12 +1006,12 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(ptr addrspace(1) %out, hal ; GFX11-LABEL: add_inline_imm_neg_2.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: 
[0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s2, -2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xea,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s4, -2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xea,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -1013,25 +1019,26 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(ptr addrspace(1) %out, hal ; ; VI-LABEL: add_inline_imm_neg_2.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s6, -2.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xea,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s4, -2.0 ; encoding: 
[0x00,0x00,0x1f,0xd1,0x04,0xea,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_neg_2.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, -2.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, -2.0 @@ -1043,8 +1050,8 @@ define amdgpu_kernel void @add_inline_imm_4.0_f16(ptr addrspace(1) %out, half %x ; GFX10-LABEL: add_inline_imm_4.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xec,0x01,0x00] @@ -1055,12 +1062,12 @@ define amdgpu_kernel void @add_inline_imm_4.0_f16(ptr addrspace(1) %out, half %x ; GFX11-LABEL: add_inline_imm_4.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: 
[0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s2, 4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xec,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s4, 4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xec,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -1068,25 +1075,26 @@ define amdgpu_kernel void @add_inline_imm_4.0_f16(ptr addrspace(1) %out, half %x ; ; VI-LABEL: add_inline_imm_4.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s6, 4.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xec,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s4, 4.0 ; encoding: 
[0x00,0x00,0x1f,0xd1,0x04,0xec,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_4.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 4.0 @@ -1098,8 +1106,8 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(ptr addrspace(1) %out, hal ; GFX10-LABEL: add_inline_imm_neg_4.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, -4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xee,0x01,0x00] @@ -1110,12 +1118,12 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(ptr addrspace(1) %out, hal ; GFX11-LABEL: add_inline_imm_neg_4.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: 
[0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s2, -4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xee,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s4, -4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xee,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -1123,25 +1131,26 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(ptr addrspace(1) %out, hal ; ; VI-LABEL: add_inline_imm_neg_4.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s6, -4.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xee,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s4, -4.0 ; encoding: 
[0x00,0x00,0x1f,0xd1,0x04,0xee,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_neg_4.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, -4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, -4.0 @@ -1152,7 +1161,7 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(ptr addrspace(1) %out, hal define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX10-LABEL: commute_add_inline_imm_0.5_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x03,0x86,0xbe] ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x03,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x03,0x8a,0xbe] @@ -1170,7 +1179,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(ptr addrspace(1) %out, ; ; GFX11-LABEL: commute_add_inline_imm_0.5_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; encoding: [0x00,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; encoding: [0x01,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; GFX11-NEXT: s_mov_b32 
s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1190,7 +1199,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(ptr addrspace(1) %out, ; ; VI-LABEL: commute_add_inline_imm_0.5_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; VI-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1208,7 +1217,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(ptr addrspace(1) %out, ; ; SI-LABEL: commute_add_inline_imm_0.5_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1234,7 +1243,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(ptr addrspace(1) %out, define amdgpu_kernel void @commute_add_literal_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX10-LABEL: commute_add_literal_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x03,0x86,0xbe] ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x03,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x03,0x8a,0xbe] @@ -1252,7 +1261,7 @@ define amdgpu_kernel void @commute_add_literal_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: commute_add_literal_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; encoding: 
[0x00,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; encoding: [0x01,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1272,7 +1281,7 @@ define amdgpu_kernel void @commute_add_literal_f16(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: commute_add_literal_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; VI-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1290,7 +1299,7 @@ define amdgpu_kernel void @commute_add_literal_f16(ptr addrspace(1) %out, ptr ad ; ; SI-LABEL: commute_add_literal_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1317,8 +1326,8 @@ define amdgpu_kernel void @add_inline_imm_1_f16(ptr addrspace(1) %out, half %x) ; GFX10-LABEL: add_inline_imm_1_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 1 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x02,0x01,0x00] @@ -1329,12 +1338,12 @@ define amdgpu_kernel void @add_inline_imm_1_f16(ptr addrspace(1) %out, half %x) ; GFX11-LABEL: add_inline_imm_1_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s2, 1 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x02,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s4, 1 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x02,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -1342,25 +1351,26 @@ define amdgpu_kernel void @add_inline_imm_1_f16(ptr addrspace(1) %out, half %x) ; ; VI-LABEL: add_inline_imm_1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; 
encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s6, 1 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x02,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s4, 1 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x02,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_1_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 0x33800000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 0xH0001 @@ -1372,8 +1382,8 @@ define amdgpu_kernel void @add_inline_imm_2_f16(ptr addrspace(1) %out, half %x) ; GFX10-LABEL: add_inline_imm_2_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; 
encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 2 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x04,0x01,0x00] @@ -1384,12 +1394,12 @@ define amdgpu_kernel void @add_inline_imm_2_f16(ptr addrspace(1) %out, half %x) ; GFX11-LABEL: add_inline_imm_2_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s2, 2 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x04,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s4, 2 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x04,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -1397,25 +1407,26 @@ define amdgpu_kernel void @add_inline_imm_2_f16(ptr addrspace(1) %out, half %x) ; ; VI-LABEL: add_inline_imm_2_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: 
[0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s6, 2 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x04,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s4, 2 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x04,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_2_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 0x34000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 0xH0002 @@ -1427,8 +1438,8 @@ define amdgpu_kernel void @add_inline_imm_16_f16(ptr addrspace(1) %out, half %x) ; GFX10-LABEL: add_inline_imm_16_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: 
[0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 16 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x20,0x01,0x00] @@ -1439,12 +1450,12 @@ define amdgpu_kernel void @add_inline_imm_16_f16(ptr addrspace(1) %out, half %x) ; GFX11-LABEL: add_inline_imm_16_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s2, 16 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x20,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s4, 16 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x20,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -1452,25 +1463,26 @@ define amdgpu_kernel void @add_inline_imm_16_f16(ptr addrspace(1) %out, half %x) ; ; VI-LABEL: add_inline_imm_16_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: 
[0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s6, 16 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x20,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s4, 16 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x20,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_16_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 0x35800000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 0xH0010 @@ -1481,7 +1493,7 @@ define amdgpu_kernel void @add_inline_imm_16_f16(ptr addrspace(1) %out, half %x) define amdgpu_kernel void @add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX10-LABEL: add_inline_imm_neg_1_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x03,0x86,0xbe] ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x03,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x03,0x8a,0xbe] @@ -1499,7 +1511,7 @@ define amdgpu_kernel void 
@add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: add_inline_imm_neg_1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; encoding: [0x00,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; encoding: [0x01,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1519,7 +1531,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: add_inline_imm_neg_1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; VI-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1537,7 +1549,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr a ; ; SI-LABEL: add_inline_imm_neg_1_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1562,7 +1574,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX10-LABEL: add_inline_imm_neg_2_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x03,0x86,0xbe] ; 
GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x03,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x03,0x8a,0xbe] @@ -1580,7 +1592,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: add_inline_imm_neg_2_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; encoding: [0x00,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; encoding: [0x01,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1600,7 +1612,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: add_inline_imm_neg_2_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; VI-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1618,7 +1630,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr a ; ; SI-LABEL: add_inline_imm_neg_2_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1643,7 +1655,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @add_inline_imm_neg_16_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX10-LABEL: add_inline_imm_neg_16_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: 
[0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x03,0x86,0xbe] ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x03,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x03,0x8a,0xbe] @@ -1661,7 +1673,7 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f16(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: add_inline_imm_neg_16_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; encoding: [0x00,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; encoding: [0x01,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1681,7 +1693,7 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f16(ptr addrspace(1) %out, ptr ; ; VI-LABEL: add_inline_imm_neg_16_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; VI-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1699,7 +1711,7 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f16(ptr addrspace(1) %out, ptr ; ; SI-LABEL: add_inline_imm_neg_16_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1725,8 +1737,8 @@ define amdgpu_kernel void @add_inline_imm_63_f16(ptr addrspace(1) %out, half %x) ; GFX10-LABEL: 
add_inline_imm_63_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 63 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x7e,0x01,0x00] @@ -1737,12 +1749,12 @@ define amdgpu_kernel void @add_inline_imm_63_f16(ptr addrspace(1) %out, half %x) ; GFX11-LABEL: add_inline_imm_63_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s2, 63 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x7e,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s4, 63 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x7e,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: 
[0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -1750,25 +1762,26 @@ define amdgpu_kernel void @add_inline_imm_63_f16(ptr addrspace(1) %out, half %x) ; ; VI-LABEL: add_inline_imm_63_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s6, 63 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x7e,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s4, 63 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x7e,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_63_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 0x367c0000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 0xH003F @@ -1780,8 +1793,8 @@ define amdgpu_kernel void @add_inline_imm_64_f16(ptr addrspace(1) %out, half %x) ; GFX10-LABEL: add_inline_imm_64_f16: ; GFX10: 
; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 64 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x80,0x01,0x00] @@ -1792,12 +1805,12 @@ define amdgpu_kernel void @add_inline_imm_64_f16(ptr addrspace(1) %out, half %x) ; GFX11-LABEL: add_inline_imm_64_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s2, 64 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x80,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s4, 64 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x80,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; 
GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -1805,25 +1818,26 @@ define amdgpu_kernel void @add_inline_imm_64_f16(ptr addrspace(1) %out, half %x) ; ; VI-LABEL: add_inline_imm_64_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s6, 64 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x80,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s4, 64 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x80,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_64_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 0x36800000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 0xH0040 diff --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll index ae51c3edf1c7e..342d7b0237118 100644 --- a/llvm/test/CodeGen/AMDGPU/immv216.ll 
+++ b/llvm/test/CodeGen/AMDGPU/immv216.ll @@ -665,4 +665,4 @@ define <2 x i16> @mul_inline_imm_inv2pi_v2i16(<2 x i16> %x) { ret <2 x i16> %y } -attributes #0 = { nounwind } +attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll index 72f10ea892e53..b89dbd42e0466 100644 --- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll @@ -10,8 +10,8 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) { ; GFX8V4-LABEL: addrspacecast: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x40 +; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x40 ; GFX8V4-NEXT: v_mov_b32_e32 v4, 1 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_lg_u32 s0, -1 @@ -33,8 +33,8 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX8V5-LABEL: addrspacecast: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xc8 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0xc8 ; GFX8V5-NEXT: v_mov_b32_e32 v4, 1 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_lg_u32 s0, -1 @@ -56,7 +56,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V4-LABEL: addrspacecast: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; 
GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX9V4-NEXT: v_mov_b32_e32 v4, 1 @@ -80,7 +80,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V5-LABEL: addrspacecast: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX9V5-NEXT: v_mov_b32_e32 v4, 1 @@ -111,8 +111,8 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; GFX8V4-LABEL: llvm_amdgcn_is_shared: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x40 -; GFX8V4-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x40 +; GFX8V4-NEXT: s_load_dword s1, s[8:9], 0x4 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -123,8 +123,8 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; ; GFX8V5-LABEL: llvm_amdgcn_is_shared: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dword s0, s[4:5], 0xcc -; GFX8V5-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX8V5-NEXT: s_load_dword s0, s[6:7], 0xcc +; GFX8V5-NEXT: s_load_dword s1, s[6:7], 0x4 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -135,7 +135,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; ; GFX9V4-LABEL: llvm_amdgcn_is_shared: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dword s2, s[4:5], 0x4 +; GFX9V4-NEXT: s_load_dword s2, s[8:9], 0x4 ; GFX9V4-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V4-NEXT: s_cmp_eq_u32 s2, s1 @@ -147,7 +147,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; ; GFX9V5-LABEL: llvm_amdgcn_is_shared: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dword s2, 
s[4:5], 0x4 +; GFX9V5-NEXT: s_load_dword s2, s[6:7], 0x4 ; GFX9V5-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_cmp_eq_u32 s2, s1 @@ -165,8 +165,8 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; GFX8V4-LABEL: llvm_amdgcn_is_private: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x44 -; GFX8V4-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x44 +; GFX8V4-NEXT: s_load_dword s1, s[8:9], 0x4 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -177,8 +177,8 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; ; GFX8V5-LABEL: llvm_amdgcn_is_private: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dword s0, s[4:5], 0xc8 -; GFX8V5-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX8V5-NEXT: s_load_dword s0, s[6:7], 0xc8 +; GFX8V5-NEXT: s_load_dword s1, s[6:7], 0x4 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -189,7 +189,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; ; GFX9V4-LABEL: llvm_amdgcn_is_private: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dword s2, s[4:5], 0x4 +; GFX9V4-NEXT: s_load_dword s2, s[8:9], 0x4 ; GFX9V4-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V4-NEXT: s_cmp_eq_u32 s2, s1 @@ -201,7 +201,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; ; GFX9V5-LABEL: llvm_amdgcn_is_private: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dword s2, s[4:5], 0x4 +; GFX9V5-NEXT: s_load_dword s2, s[6:7], 0x4 ; GFX9V5-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_cmp_eq_u32 s2, s1 @@ -219,12 +219,12 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { define amdgpu_kernel void @llvm_trap() { ; GFX8V4-LABEL: llvm_trap: ; GFX8V4: ; %bb.0: -; 
GFX8V4-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8V4-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX8V4-NEXT: s_trap 2 ; ; GFX8V5-LABEL: llvm_trap: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xc8 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0xc8 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_trap 2 ; diff --git a/llvm/test/CodeGen/AMDGPU/implicitarg-attributes.ll b/llvm/test/CodeGen/AMDGPU/implicitarg-attributes.ll index 4c5c136f5333f..7c8d89ef03b1b 100644 --- a/llvm/test/CodeGen/AMDGPU/implicitarg-attributes.ll +++ b/llvm/test/CodeGen/AMDGPU/implicitarg-attributes.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s | FileCheck %s +; RUN: opt -passes=amdgpu-attributor < %s | llc | FileCheck %s target triple = "amdgcn-amd-amdhsa" diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll index 47110d9491887..eb4cba35e9946 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll @@ -11,59 +11,63 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() { ; GFX9-LABEL: indirect_call_known_no_special_inputs: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; GFX9-NEXT: s_add_u32 s0, s0, s7 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-NEXT: s_add_u32 s0, s0, s15 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: s_load_dword s7, s[4:5], 0x0 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 -; GFX9-NEXT: s_getpc_b64 s[8:9] -; GFX9-NEXT: s_add_u32 s8, s8, snork@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s9, s9, snork@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 s[10:11], 
s[8:9] ; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: s_load_dword s15, s[8:9], 0x0 +; GFX9-NEXT: s_getpc_b64 s[8:9] +; GFX9-NEXT: s_add_u32 s8, s8, wobble@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s9, s9, wobble@gotpcrel32@hi+12 +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, snork@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, snork@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[20:21], s[8:9], 0x0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, 1, s7 -; GFX9-NEXT: s_cmp_eq_u32 s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v31, v0 -; GFX9-NEXT: s_cselect_b32 s5, s13, s11 -; GFX9-NEXT: s_cselect_b32 s4, s12, s10 -; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_and_b32 s8, 1, s15 +; GFX9-NEXT: s_cmp_eq_u32 s8, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_cselect_b32 s17, s21, s19 +; GFX9-NEXT: s_cselect_b32 s16, s20, s18 +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: indirect_call_known_no_special_inputs: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_getpc_b64 s[2:3] -; GFX12-NEXT: s_sext_i32_i16 s3, s3 -; GFX12-NEXT: s_add_co_u32 s2, s2, snork@gotpcrel32@lo+8 -; GFX12-NEXT: s_add_co_ci_u32 s3, s3, snork@gotpcrel32@hi+16 -; GFX12-NEXT: s_mov_b64 s[0:1], 0 -; GFX12-NEXT: s_getpc_b64 s[4:5] -; GFX12-NEXT: s_sext_i32_i16 s5, s5 -; GFX12-NEXT: s_add_co_u32 s4, s4, wobble@gotpcrel32@lo+8 -; GFX12-NEXT: s_add_co_ci_u32 s5, s5, wobble@gotpcrel32@hi+16 -; GFX12-NEXT: s_load_u8 s6, s[0:1], 0x0 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-NEXT: s_getpc_b64 s[6:7] +; GFX12-NEXT: s_sext_i32_i16 s7, s7 +; GFX12-NEXT: s_add_co_u32 s6, s6, 
snork@gotpcrel32@lo+8 +; GFX12-NEXT: s_add_co_ci_u32 s7, s7, snork@gotpcrel32@hi+16 +; GFX12-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX12-NEXT: s_mov_b64 s[4:5], 0 +; GFX12-NEXT: s_getpc_b64 s[8:9] +; GFX12-NEXT: s_sext_i32_i16 s9, s9 +; GFX12-NEXT: s_add_co_u32 s8, s8, wobble@gotpcrel32@lo+8 +; GFX12-NEXT: s_add_co_ci_u32 s9, s9, wobble@gotpcrel32@hi+16 +; GFX12-NEXT: s_load_u8 s12, s[4:5], 0x0 +; GFX12-NEXT: s_load_b64 s[4:5], s[6:7], 0x0 +; GFX12-NEXT: s_load_b64 s[6:7], s[8:9], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0 ; GFX12-NEXT: v_mov_b32_e32 v31, v0 -; GFX12-NEXT: s_mov_b64 s[8:9], 0 ; GFX12-NEXT: s_mov_b32 s32, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_and_b32 s4, 1, s6 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s4, 1 -; GFX12-NEXT: s_cselect_b32 s1, s3, s1 -; GFX12-NEXT: s_cselect_b32 s0, s2, s0 -; GFX12-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-NEXT: s_and_b32 s8, 1, s12 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s8, 1 +; GFX12-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX12-NEXT: s_cselect_b32 s7, s7, s5 +; GFX12-NEXT: s_cselect_b32 s6, s6, s4 +; GFX12-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX12-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX12-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll index 8183106b0ce9d..f54a511eff7f1 100644 --- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @infinite_loop(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 @@ -27,7 +27,6 @@ define amdgpu_kernel void @infinite_loop(ptr addrspace(1) %out) { ; IR-NEXT: br i1 true, label [[LOOP]], 
label [[DUMMYRETURNBLOCK:%.*]] ; IR: DummyReturnBlock: ; IR-NEXT: ret void -; entry: br label %loop @@ -40,10 +39,10 @@ define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc ; SI-NEXT: s_cbranch_execz .LBB1_3 ; SI-NEXT: ; %bb.1: ; %loop.preheader -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 @@ -67,7 +66,6 @@ define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) { ; IR-NEXT: br i1 true, label [[LOOP]], label [[UNIFIEDRETURNBLOCK]] ; IR: UnifiedReturnBlock: ; IR-NEXT: ret void -; entry: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %cond = icmp eq i32 %tmp, 1 @@ -84,7 +82,7 @@ return: define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loops: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b64 s[2:3], -1 ; SI-NEXT: s_cbranch_scc1 .LBB2_4 ; SI-NEXT: ; %bb.1: @@ -131,7 +129,6 @@ define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) { ; IR-NEXT: br i1 true, label [[LOOP2]], label [[DUMMYRETURNBLOCK]] ; IR: DummyReturnBlock: ; IR-NEXT: ret void -; entry: br i1 undef, label %loop1, label %loop2 @@ -148,10 +145,10 @@ define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_nest_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc ; SI-NEXT: s_cbranch_execz .LBB3_5 ; SI-NEXT: ; %bb.1: ; %outer_loop.preheader -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v0 ; 
SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 @@ -192,7 +189,6 @@ define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; IR-NEXT: br i1 [[COND3]], label [[INNER_LOOP]], label [[OUTER_LOOP]] ; IR: UnifiedReturnBlock: ; IR-NEXT: ret void -; entry: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %cond1 = icmp ne i32 %tmp, 1 ; avoid following BB optimizing away through the domination diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll index 76b007c22b699..4d62d30a38ed3 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll @@ -8,15 +8,15 @@ define amdgpu_kernel void @s_input_output_i128() { ; GFX908-LABEL: name: s_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %4 - ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %11 + ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %11 ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: s_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %4 - ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %9 + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %9 ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=s"() @@ -27,15 +27,15 @@ define amdgpu_kernel void @s_input_output_i128() { define amdgpu_kernel void @v_input_output_i128() { ; GFX908-LABEL: name: 
v_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %4 - ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %4 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %11 + ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %11 ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:VReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: v_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %4 - ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %4 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %9 + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %9 ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6553609 /* reguse:VReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=v"() @@ -46,15 +46,15 @@ define amdgpu_kernel void @v_input_output_i128() { define amdgpu_kernel void @a_input_output_i128() { ; GFX908-LABEL: name: a_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6160394 /* regdef:AReg_128 */, def %4 - ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %4 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6160394 /* regdef:AReg_128 */, def %11 + ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %11 ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6160393 /* reguse:AReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: a_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:AReg_128_Align2 */, def %4 - ; GFX90A-NEXT: 
[[COPY:%[0-9]+]]:areg_128_align2 = COPY %4 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:AReg_128_Align2 */, def %9 + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %9 ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = call i128 asm sideeffect "; def $0", "=a"() diff --git a/llvm/test/CodeGen/AMDGPU/inline-attr.ll b/llvm/test/CodeGen/AMDGPU/inline-attr.ll index 4fecdb576a6de..e7a7b8a335d0d 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-attr.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-attr.ll @@ -6,17 +6,20 @@ ; GCN: define amdgpu_kernel void @caller(ptr addrspace(1) nocapture %p) local_unnamed_addr #1 { ; GCN: %mul.i = fmul float %load, 1.500000e+01 -; UNSAFE: attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "unsafe-fp-math"="true" } -; UNSAFE: attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="true" } +; UNSAFE: attributes #0 = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } +; UNSAFE: attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } -; NOINFS: attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "no-infs-fp-math"="true" } -; NOINFS: attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="false" "unsafe-fp-math"="false" } +; NOINFS: attributes #0 = { nounwind "amdgpu-waves-per-eu"="4,10" "no-infs-fp-math"="true" "uniform-work-group-size"="false" } +; NOINFS: attributes #1 = { nounwind "less-precise-fpmad"="false" 
"no-infs-fp-math"="true" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="false" } -; NONANS: attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "no-nans-fp-math"="true" } -; NONANS: attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="true" "unsafe-fp-math"="false" } +; NONANS: attributes #0 = { nounwind "amdgpu-waves-per-eu"="4,10" "no-nans-fp-math"="true" "uniform-work-group-size"="false" } +; NONANS: attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="false" } + +declare void @extern() #0 define float @foo(float %x) #0 { entry: + call void @extern() %mul = fmul float %x, 1.500000e+01 ret float %mul } @@ -24,7 +27,7 @@ entry: define amdgpu_kernel void @caller(ptr addrspace(1) %p) #1 { entry: %load = load float, ptr addrspace(1) %p, align 4 - %call = call fast float @foo(float %load) #0 + %call = call fast float @foo(float %load) store float %call, ptr addrspace(1) %p, align 4 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll index 46b2eb30c791c..807a7d26f49e5 100644 --- a/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll +++ b/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll @@ -10,7 +10,7 @@ entry: } ; GCN-LABEL: {{^}}inline_asm_input_v2f16: -; GCN: s_mov_b32 s0, s{{[0-9]+}} +; GCN: s_mov_b32 s2, s{{[0-9]+}} define amdgpu_kernel void @inline_asm_input_v2f16(ptr addrspace(1) %out, <2 x half> %in) #0 { entry: %val = call i32 asm "s_mov_b32 $0, $1", "=r,r"(<2 x half> %in) #0 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index f736ca7cd625a..b62bf890e65fe 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ 
b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -4,22 +4,22 @@ define amdgpu_kernel void @float4_inselt(ptr addrspace(1) %out, <4 x float> %vec, i32 %sel) { ; GCN-LABEL: float4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s2, s[0:1], 0x44 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s2, 3 +; GCN-NEXT: s_cmp_lg_u32 s8, 3 ; GCN-NEXT: v_mov_b32_e32 v0, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 2 +; GCN-NEXT: s_cmp_lg_u32 s8, 2 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1.0, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 1 ; GCN-NEXT: v_cndmask_b32_e32 v2, 1.0, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 0 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -37,7 +37,7 @@ entry: define amdgpu_kernel void @float4_inselt_undef(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: float4_inselt_undef: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1.0 ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: v_mov_b32_e32 v2, v0 @@ -56,23 +56,23 @@ entry: define amdgpu_kernel void @int4_inselt(ptr addrspace(1) %out, <4 x i32> %vec, i32 %sel) { ; GCN-LABEL: int4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s2, s[0:1], 0x44 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GCN-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s2, 3 -; GCN-NEXT: s_cselect_b32 s3, s7, 1 -; GCN-NEXT: s_cmp_lg_u32 s2, 2 -; GCN-NEXT: s_cselect_b32 s6, s6, 1 -; GCN-NEXT: s_cmp_lg_u32 s2, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 3 +; GCN-NEXT: s_cselect_b32 s2, s7, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 2 +; GCN-NEXT: s_cselect_b32 s3, s6, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 1 ; GCN-NEXT: s_cselect_b32 s5, s5, 1 -; GCN-NEXT: s_cmp_lg_u32 s2, 0 -; GCN-NEXT: s_cselect_b32 s2, s4, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cselect_b32 s4, s4, 1 ; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: v_mov_b32_e32 v3, s2 ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -85,15 +85,15 @@ entry: define amdgpu_kernel void @float2_inselt(ptr addrspace(1) %out, <2 x float> %vec, i32 %sel) { ; GCN-LABEL: float2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[0:1], 0x34 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s4, 1 -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: s_cmp_lg_u32 s6, 1 +; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc @@ -109,21 +109,21 @@ entry: define amdgpu_kernel void @float8_inselt(ptr addrspace(1) %out, <8 x float> %vec, i32 %sel) { ; GCN-LABEL: float8_inselt: ; GCN: ; %bb.0: ; %entry -; 
GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 -; GCN-NEXT: s_load_dword s2, s[0:1], 0x64 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; GCN-NEXT: s_load_dword s12, s[2:3], 0x64 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: s_add_u32 s2, s0, 16 ; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NEXT: v_mov_b32_e32 v4, s8 ; GCN-NEXT: v_mov_b32_e32 v5, s9 ; GCN-NEXT: v_mov_b32_e32 v6, s10 ; GCN-NEXT: v_mov_b32_e32 v7, s11 +; GCN-NEXT: s_mov_b32 m0, s12 ; GCN-NEXT: v_mov_b32_e32 v9, s3 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 ; GCN-NEXT: v_mov_b32_e32 v8, s2 @@ -142,14 +142,14 @@ entry: define amdgpu_kernel void @float16_inselt(ptr addrspace(1) %out, <16 x float> %vec, i32 %sel) { ; GCN-LABEL: float16_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dword s20, s[0:1], 0xa4 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s20, s[2:3], 0xa4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_add_u32 s0, s2, 48 -; GCN-NEXT: s_addc_u32 s1, s3, 0 -; GCN-NEXT: v_mov_b32_e32 v17, s1 +; GCN-NEXT: s_add_u32 s2, s0, 48 +; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v17, s3 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v3, s7 @@ -166,24 +166,24 @@ define amdgpu_kernel void @float16_inselt(ptr addrspace(1) %out, <16 x float> %v ; GCN-NEXT: v_mov_b32_e32 v14, s18 ; GCN-NEXT: v_mov_b32_e32 v15, s19 ; GCN-NEXT: s_mov_b32 m0, s20 -; GCN-NEXT: v_mov_b32_e32 v16, s0 -; GCN-NEXT: s_add_u32 s0, s2, 32 +; GCN-NEXT: v_mov_b32_e32 v16, s2 +; 
GCN-NEXT: s_add_u32 s2, s0, 32 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 -; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v13, s1 -; GCN-NEXT: v_mov_b32_e32 v12, s0 -; GCN-NEXT: s_add_u32 s0, s2, 16 -; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v13, s3 +; GCN-NEXT: v_mov_b32_e32 v12, s2 +; GCN-NEXT: s_add_u32 s2, s0, 16 +; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v9, s1 -; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_mov_b32_e32 v9, s3 +; GCN-NEXT: v_mov_b32_e32 v8, s2 ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm entry: @@ -195,18 +195,18 @@ entry: define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %vec, i32 %sel) { ; GCN-LABEL: float32_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0xe4 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x124 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0xe4 +; GCN-NEXT: s_load_dword s2, s[2:3], 0x124 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NEXT: v_mov_b32_e32 v2, s38 -; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: s_add_u32 s0, s2, 0x70 -; GCN-NEXT: s_addc_u32 s1, s3, 0 -; GCN-NEXT: v_mov_b32_e32 v33, s1 +; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v33, s3 ; GCN-NEXT: v_mov_b32_e32 v3, s39 ; GCN-NEXT: v_mov_b32_e32 
v4, s40 ; GCN-NEXT: v_mov_b32_e32 v5, s41 @@ -236,48 +236,48 @@ define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %v ; GCN-NEXT: v_mov_b32_e32 v29, s17 ; GCN-NEXT: v_mov_b32_e32 v30, s18 ; GCN-NEXT: v_mov_b32_e32 v31, s19 -; GCN-NEXT: v_mov_b32_e32 v32, s0 -; GCN-NEXT: s_add_u32 s0, s2, 0x60 +; GCN-NEXT: v_mov_b32_e32 v32, s2 +; GCN-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 -; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[32:33], v[28:31] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v29, s1 -; GCN-NEXT: v_mov_b32_e32 v28, s0 -; GCN-NEXT: s_add_u32 s0, s2, 0x50 -; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v29, s3 +; GCN-NEXT: v_mov_b32_e32 v28, s2 +; GCN-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[28:29], v[24:27] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v25, s1 -; GCN-NEXT: v_mov_b32_e32 v24, s0 -; GCN-NEXT: s_add_u32 s0, s2, 64 -; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v25, s3 +; GCN-NEXT: v_mov_b32_e32 v24, s2 +; GCN-NEXT: s_add_u32 s2, s0, 64 +; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[24:25], v[20:23] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v21, s1 -; GCN-NEXT: v_mov_b32_e32 v20, s0 -; GCN-NEXT: s_add_u32 s0, s2, 48 -; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v21, s3 +; GCN-NEXT: v_mov_b32_e32 v20, s2 +; GCN-NEXT: s_add_u32 s2, s0, 48 +; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[20:21], v[16:19] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v17, s1 -; GCN-NEXT: v_mov_b32_e32 v16, s0 -; GCN-NEXT: s_add_u32 s0, s2, 32 -; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v17, s3 +; GCN-NEXT: v_mov_b32_e32 v16, s2 +; GCN-NEXT: s_add_u32 s2, s0, 32 +; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v13, s1 -; 
GCN-NEXT: v_mov_b32_e32 v12, s0 -; GCN-NEXT: s_add_u32 s0, s2, 16 -; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v13, s3 +; GCN-NEXT: v_mov_b32_e32 v12, s2 +; GCN-NEXT: s_add_u32 s2, s0, 16 +; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v9, s1 -; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_mov_b32_e32 v9, s3 +; GCN-NEXT: v_mov_b32_e32 v8, s2 ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm entry: @@ -289,8 +289,8 @@ entry: define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, i32 %sel) { ; GCN-LABEL: half4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[0:1], 0x34 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s6, s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -314,7 +314,7 @@ entry: define amdgpu_kernel void @half2_inselt(ptr addrspace(1) %out, <2 x half> %vec, i32 %sel) { ; GCN-LABEL: half2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s3, s3, 4 ; GCN-NEXT: s_lshl_b32 s3, 0xffff, s3 @@ -335,49 +335,49 @@ entry: define amdgpu_kernel void @half8_inselt(ptr addrspace(1) %out, <8 x half> %vec, i32 %sel) { ; GCN-LABEL: half8_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GCN-NEXT: s_load_dword s2, s[0:1], 0x44 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s3, s7, 16 -; GCN-NEXT: s_cmp_lg_u32 s2, 7 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: s_lshr_b32 s2, s7, 16 +; GCN-NEXT: s_cmp_lg_u32 s8, 7 +; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 6 +; GCN-NEXT: s_cmp_lg_u32 s8, 6 ; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s3, s6, 16 +; GCN-NEXT: s_lshr_b32 s2, s6, 16 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; GCN-NEXT: s_cmp_lg_u32 s2, 5 +; GCN-NEXT: s_cmp_lg_u32 s8, 5 ; GCN-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 4 +; GCN-NEXT: s_cmp_lg_u32 s8, 4 ; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s3, s5, 16 +; GCN-NEXT: s_lshr_b32 s2, s5, 16 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; GCN-NEXT: s_cmp_lg_u32 s2, 3 +; GCN-NEXT: s_cmp_lg_u32 s8, 3 ; GCN-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 2 +; GCN-NEXT: s_cmp_lg_u32 s8, 2 ; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v4, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s3, s4, 16 +; GCN-NEXT: s_lshr_b32 s2, s4, 16 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GCN-NEXT: s_cmp_lg_u32 s2, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 1 ; GCN-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v4, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 0 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc ; GCN-NEXT: v_mov_b32_e32 v5, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -397,7 +397,7 @@ entry: define amdgpu_kernel void @short2_inselt(ptr addrspace(1) %out, <2 x i16> %vec, i32 %sel) { ; GCN-LABEL: short2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s3, s3, 4 ; GCN-NEXT: s_lshl_b32 s3, 0xffff, s3 @@ -418,8 +418,8 @@ entry: define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, i32 %sel) { ; GCN-LABEL: short4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[0:1], 0x34 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s6, s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x10001 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -443,8 +443,8 @@ entry: define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i32 %sel) { ; GCN-LABEL: byte8_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[0:1], 0x34 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s4, s4, 3 ; GCN-NEXT: s_lshl_b64 s[4:5], 0xff, s4 @@ -467,99 +467,99 @@ entry: define amdgpu_kernel void @byte16_inselt(ptr addrspace(1) %out, <16 x i8> %vec, i32 %sel) { ; GCN-LABEL: byte16_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GCN-NEXT: s_load_dword s2, s[0:1], 0x44 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GCN-NEXT: 
s_load_dword s8, s[2:3], 0x44 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s3, s7, 24 -; GCN-NEXT: s_cmp_lg_u32 s2, 15 -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: s_lshr_b32 s2, s7, 24 +; GCN-NEXT: s_cmp_lg_u32 s8, 15 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s3, s7, 16 -; GCN-NEXT: s_cmp_lg_u32 s2, 14 +; GCN-NEXT: s_lshr_b32 s2, s7, 16 +; GCN-NEXT: s_cmp_lg_u32 s8, 14 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s3, s7, 8 +; GCN-NEXT: s_lshr_b32 s2, s7, 8 ; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc -; GCN-NEXT: s_cmp_lg_u32 s2, 13 +; GCN-NEXT: s_cmp_lg_u32 s8, 13 ; GCN-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 12 +; GCN-NEXT: s_cmp_lg_u32 s8, 12 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, 1, v2, vcc -; GCN-NEXT: s_lshr_b32 s3, s6, 24 +; GCN-NEXT: s_lshr_b32 s2, s6, 24 ; GCN-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: s_cmp_lg_u32 s2, 11 +; GCN-NEXT: s_cmp_lg_u32 s8, 11 ; GCN-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s3, s6, 16 -; GCN-NEXT: s_cmp_lg_u32 s2, 10 +; GCN-NEXT: s_lshr_b32 s2, s6, 16 +; GCN-NEXT: s_cmp_lg_u32 s8, 10 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s3 
+; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s3, s6, 8 +; GCN-NEXT: s_lshr_b32 s2, s6, 8 ; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc -; GCN-NEXT: s_cmp_lg_u32 s2, 9 +; GCN-NEXT: s_cmp_lg_u32 s8, 9 ; GCN-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 8 +; GCN-NEXT: s_cmp_lg_u32 s8, 8 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, 1, v2, vcc -; GCN-NEXT: s_lshr_b32 s3, s5, 24 +; GCN-NEXT: s_lshr_b32 s2, s5, 24 ; GCN-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: s_cmp_lg_u32 s2, 7 +; GCN-NEXT: s_cmp_lg_u32 s8, 7 ; GCN-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s3, s5, 16 -; GCN-NEXT: s_cmp_lg_u32 s2, 6 +; GCN-NEXT: s_lshr_b32 s2, s5, 16 +; GCN-NEXT: s_cmp_lg_u32 s8, 6 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s3, s5, 8 +; GCN-NEXT: s_lshr_b32 s2, s5, 8 ; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc -; GCN-NEXT: s_cmp_lg_u32 s2, 5 +; GCN-NEXT: s_cmp_lg_u32 s8, 5 ; GCN-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 4 +; GCN-NEXT: s_cmp_lg_u32 s8, 4 ; GCN-NEXT: 
v_cndmask_b32_e32 v1, 1, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v4, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc -; GCN-NEXT: s_lshr_b32 s3, s4, 24 +; GCN-NEXT: s_lshr_b32 s2, s4, 24 ; GCN-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: s_cmp_lg_u32 s2, 3 +; GCN-NEXT: s_cmp_lg_u32 s8, 3 ; GCN-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s3, s4, 16 -; GCN-NEXT: s_cmp_lg_u32 s2, 2 +; GCN-NEXT: s_lshr_b32 s2, s4, 16 +; GCN-NEXT: s_cmp_lg_u32 s8, 2 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s3, s4, 8 +; GCN-NEXT: s_lshr_b32 s2, s4, 8 ; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc -; GCN-NEXT: s_cmp_lg_u32 s2, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 1 ; GCN-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v4, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 0 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc ; GCN-NEXT: v_mov_b32_e32 v5, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -580,21 +580,21 @@ entry: define amdgpu_kernel void @double2_inselt(ptr addrspace(1) %out, <2 x double> %vec, i32 %sel) { ; GCN-LABEL: double2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s2, s[0:1], 0x44 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: 
s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s2, 1 -; GCN-NEXT: s_cselect_b32 s3, 0x3ff00000, s7 -; GCN-NEXT: s_cselect_b32 s6, 0, s6 -; GCN-NEXT: s_cmp_eq_u32 s2, 0 -; GCN-NEXT: s_cselect_b32 s2, 0x3ff00000, s5 +; GCN-NEXT: s_cmp_eq_u32 s8, 1 +; GCN-NEXT: s_cselect_b32 s2, 0x3ff00000, s7 +; GCN-NEXT: s_cselect_b32 s3, 0, s6 +; GCN-NEXT: s_cmp_eq_u32 s8, 0 +; GCN-NEXT: s_cselect_b32 s5, 0x3ff00000, s5 ; GCN-NEXT: s_cselect_b32 s4, 0, s4 ; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s2 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: v_mov_b32_e32 v3, s2 ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -607,10 +607,10 @@ entry: define amdgpu_kernel void @double5_inselt(ptr addrspace(1) %out, <5 x double> %vec, i32 %sel) { ; GCN-LABEL: double5_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s12, s[0:1], 0xa4 -; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x84 -; GCN-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64 +; GCN-NEXT: s_load_dword s12, s[2:3], 0xa4 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x84 +; GCN-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x64 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s12, 4 ; GCN-NEXT: s_cselect_b32 s9, 0x3ff00000, s9 @@ -661,12 +661,12 @@ entry: define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %vec, i32 %sel) { ; GCN-LABEL: double8_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s2, s[0:1], 0xa4 -; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s20, s[2:3], 0xa4 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v16, 0x3ff00000 ; 
GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s2, s2, 1 +; GCN-NEXT: s_lshl_b32 s2, s20, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 @@ -717,17 +717,17 @@ entry: define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %vec, i32 %sel) { ; GCN-LABEL: double7_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x94 -; GCN-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x84 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xa4 +; GCN-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x64 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x94 +; GCN-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x84 +; GCN-NEXT: s_load_dword s2, s[2:3], 0xa4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NEXT: s_lshl_b32 s0, s0, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v4, s8 ; GCN-NEXT: v_mov_b32_e32 v5, s9 ; GCN-NEXT: v_mov_b32_e32 v6, s10 @@ -738,25 +738,25 @@ define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %v ; GCN-NEXT: v_mov_b32_e32 v11, s15 ; GCN-NEXT: v_mov_b32_e32 v12, s16 ; GCN-NEXT: v_mov_b32_e32 v13, s17 -; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: v_movreld_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v16, 0x3ff00000 -; GCN-NEXT: s_add_u32 s0, s2, 16 +; GCN-NEXT: s_add_u32 s2, s0, 16 ; GCN-NEXT: v_movreld_b32_e32 v1, v16 -; GCN-NEXT: s_addc_u32 s1, s3, 0 -; GCN-NEXT: v_mov_b32_e32 v15, s1 -; GCN-NEXT: v_mov_b32_e32 v14, s0 +; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v15, s3 +; GCN-NEXT: v_mov_b32_e32 v14, s2 ; GCN-NEXT: flat_store_dwordx4 v[14:15], v[4:7] -; GCN-NEXT: s_add_u32 s0, s2, 48 -; GCN-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NEXT: 
v_mov_b32_e32 v4, s2 +; GCN-NEXT: s_add_u32 s2, s0, 48 +; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NEXT: s_addc_u32 s1, s3, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: s_add_u32 s0, s2, 32 +; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: s_add_u32 s0, s0, 32 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[12:13] -; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dwordx4 v[0:1], v[8:11] @@ -770,14 +770,15 @@ entry: define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> %vec, i32 %sel) { ; GCN-LABEL: double16_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s2, s[0:1], 0x124 -; GCN-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4 -; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0xe4 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[2:3], 0x124 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0xe4 ; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NEXT: s_lshl_b32 s2, s2, 1 +; GCN-NEXT: s_lshl_b32 s0, s0, 1 +; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NEXT: v_mov_b32_e32 v2, s38 ; GCN-NEXT: v_mov_b32_e32 v3, s39 @@ -809,7 +810,7 @@ define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> ; GCN-NEXT: v_mov_b32_e32 v29, s17 ; GCN-NEXT: v_mov_b32_e32 v30, s18 ; GCN-NEXT: v_mov_b32_e32 v31, s19 -; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-NEXT: v_movreld_b32_e32 v0, 0 ; GCN-NEXT: s_addc_u32 s3, s1, 0 @@ -867,20 +868,22 @@ entry: define amdgpu_kernel void 
@double15_inselt(ptr addrspace(1) %out, <15 x double> %vec, i32 %sel) { ; GCN-LABEL: double15_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0xa4 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x114 -; GCN-NEXT: s_load_dwordx4 s[20:23], s[0:1], 0x104 -; GCN-NEXT: s_load_dwordx8 s[24:31], s[0:1], 0xe4 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0xa4 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x114 +; GCN-NEXT: s_load_dwordx4 s[20:23], s[2:3], 0x104 +; GCN-NEXT: s_load_dwordx8 s[24:31], s[2:3], 0xe4 ; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_load_dword s4, s[0:1], 0x124 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_mov_b32_e32 v28, s2 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x124 +; GCN-NEXT: v_mov_b32_e32 v28, s0 +; GCN-NEXT: v_mov_b32_e32 v29, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s2, s4, 1 +; GCN-NEXT: s_lshl_b32 s0, s4, 1 +; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NEXT: v_mov_b32_e32 v4, s8 ; GCN-NEXT: v_mov_b32_e32 v5, s9 @@ -906,9 +909,8 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> ; GCN-NEXT: v_mov_b32_e32 v25, s21 ; GCN-NEXT: v_mov_b32_e32 v26, s22 ; GCN-NEXT: v_mov_b32_e32 v27, s23 -; GCN-NEXT: v_mov_b32_e32 v29, s3 -; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: v_movreld_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-NEXT: v_movreld_b32_e32 v1, v32 ; GCN-NEXT: s_addc_u32 s3, s1, 0 @@ -962,13 +964,13 @@ entry: define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32 %sel) { ; GCN-LABEL: bit4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; GCN-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s7, 
0xe80000 -; GCN-NEXT: s_add_u32 s4, s4, s3 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-NEXT: s_addc_u32 s5, s5, 0 +; GCN-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s14, -1 +; GCN-NEXT: s_mov_b32 s15, 0xe80000 +; GCN-NEXT: s_add_u32 s12, s12, s9 +; GCN-NEXT: s_addc_u32 s13, s13, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s3, 3 @@ -980,16 +982,16 @@ define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32 ; GCN-NEXT: v_and_b32_e32 v2, 1, v2 ; GCN-NEXT: v_and_b32_e32 v3, 3, v3 ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 -; GCN-NEXT: buffer_store_byte v1, off, s[4:7], 0 -; GCN-NEXT: buffer_store_byte v4, off, s[4:7], 0 offset:3 -; GCN-NEXT: buffer_store_byte v3, off, s[4:7], 0 offset:2 -; GCN-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:1 +; GCN-NEXT: buffer_store_byte v1, off, s[12:15], 0 +; GCN-NEXT: buffer_store_byte v4, off, s[12:15], 0 offset:3 +; GCN-NEXT: buffer_store_byte v3, off, s[12:15], 0 offset:2 +; GCN-NEXT: buffer_store_byte v2, off, s[12:15], 0 offset:1 ; GCN-NEXT: v_mov_b32_e32 v1, 1 -; GCN-NEXT: buffer_store_byte v1, v0, s[4:7], 0 offen -; GCN-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 -; GCN-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:1 -; GCN-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 offset:2 -; GCN-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:3 +; GCN-NEXT: buffer_store_byte v1, v0, s[12:15], 0 offen +; GCN-NEXT: buffer_load_ubyte v0, off, s[12:15], 0 +; GCN-NEXT: buffer_load_ubyte v1, off, s[12:15], 0 offset:1 +; GCN-NEXT: buffer_load_ubyte v2, off, s[12:15], 0 offset:2 +; GCN-NEXT: buffer_load_ubyte v3, off, s[12:15], 0 offset:3 ; GCN-NEXT: s_waitcnt vmcnt(3) ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: s_waitcnt vmcnt(2) @@ -1017,11 +1019,11 @@ entry: define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, 
i32 %sel) { ; GCN-LABEL: bit128_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x44 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[2:3], 0x44 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s1, s4, 24 +; GCN-NEXT: s_lshr_b32 s3, s4, 24 ; GCN-NEXT: s_lshr_b32 s8, s4, 16 ; GCN-NEXT: s_lshr_b32 s9, s4, 17 ; GCN-NEXT: s_lshr_b32 s10, s4, 18 @@ -1057,10 +1059,10 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: s_lshr_b32 s41, s7, 21 ; GCN-NEXT: s_lshr_b32 s42, s7, 22 ; GCN-NEXT: s_lshr_b32 s43, s7, 23 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x77 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x77 ; GCN-NEXT: v_mov_b32_e32 v15, s43 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x76 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x76 ; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc ; GCN-NEXT: v_mov_b32_e32 v18, s42 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1068,11 +1070,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x75 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x75 ; GCN-NEXT: v_or_b32_e32 v15, v15, v18 ; GCN-NEXT: v_mov_b32_e32 v18, s41 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x74 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x74 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: v_mov_b32_e32 v19, s40 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1081,11 +1083,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_and_b32_e32 v18, 3, v18 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x73 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x73 ; 
GCN-NEXT: v_or_b32_e32 v15, v18, v15 ; GCN-NEXT: v_mov_b32_e32 v18, s39 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x72 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x72 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: v_mov_b32_e32 v19, s38 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1093,11 +1095,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x71 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x71 ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_mov_b32_e32 v19, s37 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x70 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x70 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: v_mov_b32_e32 v20, s36 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1109,11 +1111,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v15 ; GCN-NEXT: v_and_b32_e32 v18, 15, v18 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7f +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7f ; GCN-NEXT: v_or_b32_e32 v15, v18, v15 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 7, s35 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7e +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7e ; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s35 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1121,11 +1123,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7d +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7d ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s35 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7c +; GCN-NEXT: s_cmpk_lg_i32 s2, 
0x7c ; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s35 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1134,22 +1136,22 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_or_b32_e32 v19, v20, v19 ; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7b +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7b ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 3, s35 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7a +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7a ; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s35 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x78 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x78 ; GCN-NEXT: v_mov_b32_e32 v13, s35 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x79 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x79 ; GCN-NEXT: v_or_b32_e32 v19, v19, v20 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s35 ; GCN-NEXT: v_cndmask_b32_e32 v13, 1, v13, vcc @@ -1164,11 +1166,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v18, 12, v18 ; GCN-NEXT: v_and_b32_sdwa v19, v19, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6f +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6f ; GCN-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_lshrrev_b16_e64 v18, 15, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6e +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6e ; GCN-NEXT: v_lshrrev_b16_e64 v19, 14, s7 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, 
-1, 0 @@ -1176,11 +1178,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6d +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6d ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 13, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6c +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6c ; GCN-NEXT: v_lshrrev_b16_e64 v20, 12, s7 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1189,11 +1191,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_or_b32_e32 v19, v20, v19 ; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6b +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6b ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 11, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6a +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6a ; GCN-NEXT: v_lshrrev_b16_e64 v20, 10, s7 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1201,11 +1203,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x69 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x69 ; GCN-NEXT: v_or_b32_e32 v19, v19, v20 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 9, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x68 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x68 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 8, s7 ; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1217,11 +1219,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v17, v17, v19 ; GCN-NEXT: 
v_lshlrev_b16_e32 v18, 12, v18 ; GCN-NEXT: v_and_b32_sdwa v17, v17, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x67 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x67 ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 7, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x66 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x66 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s7 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1229,11 +1231,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x65 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x65 ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x64 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x64 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s7 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1242,11 +1244,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_or_b32_e32 v19, v20, v19 ; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x63 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x63 ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 3, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x62 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x62 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s7 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1254,11 +1256,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 -; GCN-NEXT: s_cmpk_lg_i32 
s0, 0x61 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x61 ; GCN-NEXT: v_or_b32_e32 v19, v19, v20 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x60 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x60 ; GCN-NEXT: v_mov_b32_e32 v16, s7 ; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1271,11 +1273,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v18, 4, v18 ; GCN-NEXT: v_and_b32_e32 v16, 15, v16 ; GCN-NEXT: v_or_b32_e32 v16, v16, v18 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x57 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x57 ; GCN-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v17, s34 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x56 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x56 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: v_mov_b32_e32 v18, s33 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1283,11 +1285,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x55 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x55 ; GCN-NEXT: v_or_b32_e32 v17, v17, v18 ; GCN-NEXT: v_mov_b32_e32 v18, s31 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x54 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x54 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: v_mov_b32_e32 v19, s30 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1296,11 +1298,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_and_b32_e32 v18, 3, v18 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x53 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x53 ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: v_mov_b32_e32 v18, s29 ; GCN-NEXT: 
s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x52 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x52 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: v_mov_b32_e32 v19, s28 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1308,11 +1310,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x51 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x51 ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_mov_b32_e32 v19, s27 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x50 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x50 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: v_mov_b32_e32 v20, s26 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1324,11 +1326,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 4, v17 ; GCN-NEXT: v_and_b32_e32 v18, 15, v18 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5f +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5f ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 7, s25 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5e +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5e ; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s25 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1336,11 +1338,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5d +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5d ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s25 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5c +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5c ; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s25 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, 
v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1349,22 +1351,22 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_or_b32_e32 v19, v20, v19 ; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5b +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5b ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 3, s25 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5a +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5a ; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s25 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x58 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x58 ; GCN-NEXT: v_mov_b32_e32 v3, s25 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x59 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x59 ; GCN-NEXT: v_or_b32_e32 v19, v19, v20 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s25 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc @@ -1378,11 +1380,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v18, 12, v18 ; GCN-NEXT: v_and_b32_sdwa v3, v3, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: v_or_b32_e32 v3, v18, v3 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4f +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4f ; GCN-NEXT: v_or_b32_sdwa v17, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_lshrrev_b16_e64 v3, 15, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4e +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4e ; GCN-NEXT: v_lshrrev_b16_e64 v18, 14, s6 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1390,11 +1392,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, 
<128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 3, v3 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4d +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4d ; GCN-NEXT: v_or_b32_e32 v3, v3, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 13, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4c +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4c ; GCN-NEXT: v_lshrrev_b16_e64 v19, 12, s6 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1403,11 +1405,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_and_b32_e32 v18, 3, v18 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4b +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4b ; GCN-NEXT: v_or_b32_e32 v3, v18, v3 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 11, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4a +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4a ; GCN-NEXT: v_lshrrev_b16_e64 v19, 10, s6 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1415,11 +1417,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x49 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x49 ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 9, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x48 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x48 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 8, s6 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1431,11 +1433,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 12, v3 ; GCN-NEXT: v_and_b32_sdwa v18, v18, v13 dst_sel:BYTE_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x47 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x47 ; GCN-NEXT: v_or_b32_e32 v18, v3, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v3, 7, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x46 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x46 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s6 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1443,11 +1445,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 3, v3 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x45 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x45 ; GCN-NEXT: v_or_b32_e32 v3, v3, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x44 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x44 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s6 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1456,11 +1458,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_or_b32_e32 v19, v20, v19 ; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x43 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x43 ; GCN-NEXT: v_or_b32_e32 v19, v19, v3 ; GCN-NEXT: v_lshrrev_b16_e64 v3, 3, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x42 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x42 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s6 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1468,11 +1470,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 3, v3 ; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 -; GCN-NEXT: s_cmpk_lg_i32 s0, 0x41 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x41 ; GCN-NEXT: v_or_b32_e32 v3, v3, v20 ; GCN-NEXT: 
v_lshrrev_b16_e64 v20, 1, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 64 +; GCN-NEXT: s_cmp_lg_u32 s2, 64 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1485,11 +1487,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_sdwa v3, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v19 ; GCN-NEXT: v_and_b32_e32 v2, 15, v2 -; GCN-NEXT: s_cmp_lg_u32 s0, 55 +; GCN-NEXT: s_cmp_lg_u32 s2, 55 ; GCN-NEXT: v_or_b32_e32 v2, v2, v15 ; GCN-NEXT: v_mov_b32_e32 v15, s24 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 54 +; GCN-NEXT: s_cmp_lg_u32 s2, 54 ; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc ; GCN-NEXT: v_mov_b32_e32 v16, s23 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1497,12 +1499,12 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v16, 1, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 -; GCN-NEXT: s_cmp_lg_u32 s0, 53 +; GCN-NEXT: s_cmp_lg_u32 s2, 53 ; GCN-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_or_b32_e32 v15, v15, v16 ; GCN-NEXT: v_mov_b32_e32 v16, s22 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 52 +; GCN-NEXT: s_cmp_lg_u32 s2, 52 ; GCN-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: v_mov_b32_e32 v17, s21 @@ -1512,11 +1514,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_and_b32_e32 v16, 3, v16 -; GCN-NEXT: s_cmp_lg_u32 s0, 51 +; GCN-NEXT: s_cmp_lg_u32 s2, 51 ; GCN-NEXT: v_or_b32_e32 v15, v16, v15 ; GCN-NEXT: 
v_mov_b32_e32 v16, s20 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 50 +; GCN-NEXT: s_cmp_lg_u32 s2, 50 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: v_mov_b32_e32 v17, s19 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1524,11 +1526,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s0, 49 +; GCN-NEXT: s_cmp_lg_u32 s2, 49 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 ; GCN-NEXT: v_mov_b32_e32 v17, s18 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 48 +; GCN-NEXT: s_cmp_lg_u32 s2, 48 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: v_mov_b32_e32 v18, s17 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1540,11 +1542,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v15 ; GCN-NEXT: v_and_b32_e32 v16, 15, v16 -; GCN-NEXT: s_cmp_lg_u32 s0, 63 +; GCN-NEXT: s_cmp_lg_u32 s2, 63 ; GCN-NEXT: v_or_b32_e32 v15, v16, v15 ; GCN-NEXT: v_lshrrev_b16_e64 v16, 7, s16 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 62 +; GCN-NEXT: s_cmp_lg_u32 s2, 62 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s16 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1552,11 +1554,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s0, 61 +; GCN-NEXT: s_cmp_lg_u32 s2, 61 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s16 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 60 +; GCN-NEXT: s_cmp_lg_u32 s2, 60 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 4, s16 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, 
vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1565,22 +1567,22 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: s_cmp_lg_u32 s0, 59 +; GCN-NEXT: s_cmp_lg_u32 s2, 59 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 3, s16 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 58 +; GCN-NEXT: s_cmp_lg_u32 s2, 58 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 2, s16 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: s_cmp_lg_u32 s0, 56 +; GCN-NEXT: s_cmp_lg_u32 s2, 56 ; GCN-NEXT: v_mov_b32_e32 v14, s16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 57 +; GCN-NEXT: s_cmp_lg_u32 s2, 57 ; GCN-NEXT: v_or_b32_e32 v17, v17, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 1, s16 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc @@ -1594,11 +1596,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v16, 12, v16 ; GCN-NEXT: v_and_b32_sdwa v14, v14, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: v_or_b32_e32 v14, v16, v14 -; GCN-NEXT: s_cmp_lg_u32 s0, 47 +; GCN-NEXT: s_cmp_lg_u32 s2, 47 ; GCN-NEXT: v_or_b32_sdwa v15, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_lshrrev_b16_e64 v14, 15, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 46 +; GCN-NEXT: s_cmp_lg_u32 s2, 46 ; GCN-NEXT: v_lshrrev_b16_e64 v16, 14, s5 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1606,11 +1608,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: 
v_and_b32_e32 v16, 1, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v14, 3, v14 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 -; GCN-NEXT: s_cmp_lg_u32 s0, 45 +; GCN-NEXT: s_cmp_lg_u32 s2, 45 ; GCN-NEXT: v_or_b32_e32 v14, v14, v16 ; GCN-NEXT: v_lshrrev_b16_e64 v16, 13, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 44 +; GCN-NEXT: s_cmp_lg_u32 s2, 44 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 12, s5 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1619,11 +1621,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_and_b32_e32 v16, 3, v16 -; GCN-NEXT: s_cmp_lg_u32 s0, 43 +; GCN-NEXT: s_cmp_lg_u32 s2, 43 ; GCN-NEXT: v_or_b32_e32 v14, v16, v14 ; GCN-NEXT: v_lshrrev_b16_e64 v16, 11, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 42 +; GCN-NEXT: s_cmp_lg_u32 s2, 42 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 10, s5 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1631,11 +1633,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s0, 41 +; GCN-NEXT: s_cmp_lg_u32 s2, 41 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 9, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 40 +; GCN-NEXT: s_cmp_lg_u32 s2, 40 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 8, s5 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1647,11 +1649,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v14, 12, v14 ; GCN-NEXT: v_and_b32_sdwa v16, v16, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: 
s_cmp_lg_u32 s0, 39 +; GCN-NEXT: s_cmp_lg_u32 s2, 39 ; GCN-NEXT: v_or_b32_e32 v16, v14, v16 ; GCN-NEXT: v_lshrrev_b16_e64 v14, 7, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 38 +; GCN-NEXT: s_cmp_lg_u32 s2, 38 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s5 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1659,11 +1661,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v14, 3, v14 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s0, 37 +; GCN-NEXT: s_cmp_lg_u32 s2, 37 ; GCN-NEXT: v_or_b32_e32 v14, v14, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 36 +; GCN-NEXT: s_cmp_lg_u32 s2, 36 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 4, s5 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1672,11 +1674,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: s_cmp_lg_u32 s0, 35 +; GCN-NEXT: s_cmp_lg_u32 s2, 35 ; GCN-NEXT: v_or_b32_e32 v17, v17, v14 ; GCN-NEXT: v_lshrrev_b16_e64 v14, 3, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 34 +; GCN-NEXT: s_cmp_lg_u32 s2, 34 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 2, s5 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1684,11 +1686,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v14, 3, v14 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 -; GCN-NEXT: s_cmp_lg_u32 s0, 33 +; GCN-NEXT: s_cmp_lg_u32 s2, 33 ; GCN-NEXT: v_or_b32_e32 v18, v14, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v14, 1, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 32 
+; GCN-NEXT: s_cmp_lg_u32 s2, 32 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1702,11 +1704,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v1, 15, v1 ; GCN-NEXT: v_or_b32_e32 v1, v1, v17 ; GCN-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: s_cmp_lg_u32 s0, 23 +; GCN-NEXT: s_cmp_lg_u32 s2, 23 ; GCN-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v15, s15 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 22 +; GCN-NEXT: s_cmp_lg_u32 s2, 22 ; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc ; GCN-NEXT: v_mov_b32_e32 v16, s14 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1714,11 +1716,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v16, 1, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 -; GCN-NEXT: s_cmp_lg_u32 s0, 21 +; GCN-NEXT: s_cmp_lg_u32 s2, 21 ; GCN-NEXT: v_or_b32_e32 v15, v15, v16 ; GCN-NEXT: v_mov_b32_e32 v16, s13 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 20 +; GCN-NEXT: s_cmp_lg_u32 s2, 20 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: v_mov_b32_e32 v17, s12 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1727,11 +1729,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_and_b32_e32 v16, 3, v16 -; GCN-NEXT: s_cmp_lg_u32 s0, 19 +; GCN-NEXT: s_cmp_lg_u32 s2, 19 ; GCN-NEXT: v_or_b32_e32 v15, v16, v15 ; GCN-NEXT: v_mov_b32_e32 v16, s11 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 18 +; GCN-NEXT: s_cmp_lg_u32 s2, 18 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: v_mov_b32_e32 v17, s10 ; 
GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1739,11 +1741,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s0, 17 +; GCN-NEXT: s_cmp_lg_u32 s2, 17 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 ; GCN-NEXT: v_mov_b32_e32 v17, s9 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 16 +; GCN-NEXT: s_cmp_lg_u32 s2, 16 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: v_mov_b32_e32 v19, s8 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1755,24 +1757,24 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v15 ; GCN-NEXT: v_and_b32_e32 v16, 15, v16 -; GCN-NEXT: s_cmp_lg_u32 s0, 31 +; GCN-NEXT: s_cmp_lg_u32 s2, 31 ; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 7, s1 +; GCN-NEXT: v_lshrrev_b16_e64 v16, 7, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 30 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s1 +; GCN-NEXT: s_cmp_lg_u32 s2, 30 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s3 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s0, 29 +; GCN-NEXT: s_cmp_lg_u32 s2, 29 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s1 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 28 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 4, s1 +; GCN-NEXT: s_cmp_lg_u32 s2, 28 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 4, s3 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc @@ -1780,24 
+1782,24 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_or_b32_e32 v17, v19, v17 ; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: s_cmp_lg_u32 s0, 27 +; GCN-NEXT: s_cmp_lg_u32 s2, 27 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 3, s1 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 3, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 26 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 2, s1 +; GCN-NEXT: s_cmp_lg_u32 s2, 26 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 2, s3 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: s_cmp_lg_u32 s0, 24 -; GCN-NEXT: v_mov_b32_e32 v18, s1 +; GCN-NEXT: s_cmp_lg_u32 s2, 24 +; GCN-NEXT: v_mov_b32_e32 v18, s3 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 25 +; GCN-NEXT: s_cmp_lg_u32 s2, 25 ; GCN-NEXT: v_or_b32_e32 v17, v17, v19 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 1, s1 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 1, s3 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc @@ -1809,11 +1811,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v16, 12, v16 ; GCN-NEXT: v_and_b32_sdwa v17, v17, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: s_cmp_lg_u32 s0, 15 +; GCN-NEXT: s_cmp_lg_u32 s2, 15 ; GCN-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_lshrrev_b16_e64 v16, 15, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 14 +; GCN-NEXT: s_cmp_lg_u32 s2, 14 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 14, s4 ; GCN-NEXT: 
v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1821,11 +1823,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s0, 13 +; GCN-NEXT: s_cmp_lg_u32 s2, 13 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 13, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 12 +; GCN-NEXT: s_cmp_lg_u32 s2, 12 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 12, s4 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1833,52 +1835,52 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, v17 ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: s_cmp_lg_u32 s0, 11 +; GCN-NEXT: s_cmp_lg_u32 s2, 11 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 11, s4 ; GCN-NEXT: v_and_b32_e32 v17, 3, v17 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 10 +; GCN-NEXT: s_cmp_lg_u32 s2, 10 ; GCN-NEXT: v_lshrrev_b16_e64 v14, 10, s4 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 9 +; GCN-NEXT: s_cmp_lg_u32 s2, 9 ; GCN-NEXT: v_lshrrev_b16_e64 v12, 9, s4 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 8 +; GCN-NEXT: s_cmp_lg_u32 s2, 8 ; GCN-NEXT: v_lshrrev_b16_e64 v11, 8, s4 ; GCN-NEXT: v_cndmask_b32_e32 v12, 1, v12, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 7 +; GCN-NEXT: s_cmp_lg_u32 s2, 7 ; GCN-NEXT: v_lshrrev_b16_e64 v10, 7, s4 ; GCN-NEXT: v_cndmask_b32_e32 v11, 1, v11, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 6 +; GCN-NEXT: s_cmp_lg_u32 s2, 6 ; GCN-NEXT: v_lshrrev_b16_e64 v9, 6, s4 ; GCN-NEXT: 
v_cndmask_b32_e32 v10, 1, v10, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 5 +; GCN-NEXT: s_cmp_lg_u32 s2, 5 ; GCN-NEXT: v_lshrrev_b16_e64 v8, 5, s4 ; GCN-NEXT: v_cndmask_b32_e32 v9, 1, v9, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 4 +; GCN-NEXT: s_cmp_lg_u32 s2, 4 ; GCN-NEXT: v_lshrrev_b16_e64 v7, 4, s4 ; GCN-NEXT: v_cndmask_b32_e32 v8, 1, v8, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 3 +; GCN-NEXT: s_cmp_lg_u32 s2, 3 ; GCN-NEXT: v_lshrrev_b16_e64 v6, 3, s4 ; GCN-NEXT: v_cndmask_b32_e32 v7, 1, v7, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 2 +; GCN-NEXT: s_cmp_lg_u32 s2, 2 ; GCN-NEXT: v_lshrrev_b16_e64 v5, 2, s4 ; GCN-NEXT: v_cndmask_b32_e32 v6, 1, v6, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 1 ; GCN-NEXT: v_lshrrev_b16_e64 v4, 1, s4 ; GCN-NEXT: v_cndmask_b32_e32 v5, 1, v5, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1917,9 +1919,9 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v11, v16, v11 ; GCN-NEXT: v_or_b32_e32 v0, v0, v7 ; GCN-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 68427e8937bb9..2a8eac8712e52 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ 
b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @insertelement_v2f32_0(ptr addrspace(1) %out, <2 x float> %a) nounwind { ; SI-LABEL: insertelement_v2f32_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 @@ -22,7 +22,7 @@ define amdgpu_kernel void @insertelement_v2f32_0(ptr addrspace(1) %out, <2 x flo ; ; VI-LABEL: insertelement_v2f32_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 @@ -40,7 +40,7 @@ define amdgpu_kernel void @insertelement_v2f32_0(ptr addrspace(1) %out, <2 x flo define amdgpu_kernel void @insertelement_v2f32_1(ptr addrspace(1) %out, <2 x float> %a) nounwind { ; SI-LABEL: insertelement_v2f32_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000 @@ -53,7 +53,7 @@ define amdgpu_kernel void @insertelement_v2f32_1(ptr addrspace(1) %out, <2 x flo ; ; VI-LABEL: insertelement_v2f32_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000 @@ -71,7 +71,7 @@ define amdgpu_kernel void @insertelement_v2f32_1(ptr addrspace(1) %out, <2 x flo define amdgpu_kernel void @insertelement_v2i32_0(ptr addrspace(1) %out, <2 x i32> %a) nounwind { ; SI-LABEL: insertelement_v2i32_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 @@ -84,7 
+84,7 @@ define amdgpu_kernel void @insertelement_v2i32_0(ptr addrspace(1) %out, <2 x i32 ; ; VI-LABEL: insertelement_v2i32_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x3e7 @@ -102,7 +102,7 @@ define amdgpu_kernel void @insertelement_v2i32_0(ptr addrspace(1) %out, <2 x i32 define amdgpu_kernel void @insertelement_v2i32_1(ptr addrspace(1) %out, <2 x i32> %a) nounwind { ; SI-LABEL: insertelement_v2i32_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0x3e7 @@ -115,7 +115,7 @@ define amdgpu_kernel void @insertelement_v2i32_1(ptr addrspace(1) %out, <2 x i32 ; ; VI-LABEL: insertelement_v2i32_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0x3e7 @@ -135,8 +135,8 @@ define amdgpu_kernel void @insertelement_v2i32_1(ptr addrspace(1) %out, <2 x i32 define amdgpu_kernel void @insertelement_v4f32_0(ptr addrspace(1) %out, <4 x float> %a) nounwind { ; SI-LABEL: insertelement_v4f32_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, 0x40a00000 ; SI-NEXT: s_mov_b32 s7, 0x100f000 @@ -150,8 +150,8 @@ define amdgpu_kernel void @insertelement_v4f32_0(ptr addrspace(1) %out, <4 x flo ; ; VI-LABEL: insertelement_v4f32_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], 
s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, 0x40a00000 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 @@ -170,8 +170,8 @@ define amdgpu_kernel void @insertelement_v4f32_0(ptr addrspace(1) %out, <4 x flo define amdgpu_kernel void @insertelement_v4f32_1(ptr addrspace(1) %out, <4 x float> %a) nounwind { ; SI-LABEL: insertelement_v4f32_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s1, 0x40a00000 ; SI-NEXT: s_mov_b32 s7, 0x100f000 @@ -185,8 +185,8 @@ define amdgpu_kernel void @insertelement_v4f32_1(ptr addrspace(1) %out, <4 x flo ; ; VI-LABEL: insertelement_v4f32_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s1, 0x40a00000 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 @@ -205,8 +205,8 @@ define amdgpu_kernel void @insertelement_v4f32_1(ptr addrspace(1) %out, <4 x flo define amdgpu_kernel void @insertelement_v4f32_2(ptr addrspace(1) %out, <4 x float> %a) nounwind { ; SI-LABEL: insertelement_v4f32_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s2, 0x40a00000 ; SI-NEXT: s_mov_b32 s7, 0x100f000 @@ -220,8 +220,8 @@ define amdgpu_kernel void @insertelement_v4f32_2(ptr addrspace(1) %out, <4 x flo ; ; VI-LABEL: insertelement_v4f32_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s2, 0x40a00000 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 @@ -240,8 +240,8 @@ define amdgpu_kernel void @insertelement_v4f32_2(ptr addrspace(1) %out, <4 x flo define amdgpu_kernel void @insertelement_v4f32_3(ptr addrspace(1) %out, <4 x float> %a) nounwind { ; SI-LABEL: insertelement_v4f32_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0x40a00000 ; SI-NEXT: s_mov_b32 s7, 0x100f000 @@ -255,8 +255,8 @@ define amdgpu_kernel void @insertelement_v4f32_3(ptr addrspace(1) %out, <4 x flo ; ; VI-LABEL: insertelement_v4f32_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0x40a00000 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 @@ -275,8 +275,8 @@ define amdgpu_kernel void @insertelement_v4f32_3(ptr addrspace(1) %out, <4 x flo define amdgpu_kernel void @insertelement_v4i32_0(ptr addrspace(1) %out, <4 x i32> %a) nounwind { ; SI-LABEL: insertelement_v4i32_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_movk_i32 s0, 0x3e7 ; SI-NEXT: s_mov_b32 s7, 0x100f000 @@ -290,8 +290,8 @@ define amdgpu_kernel void @insertelement_v4i32_0(ptr addrspace(1) %out, <4 x i32 ; ; VI-LABEL: insertelement_v4i32_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; 
VI-NEXT: s_movk_i32 s0, 0x3e7 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 @@ -310,8 +310,8 @@ define amdgpu_kernel void @insertelement_v4i32_0(ptr addrspace(1) %out, <4 x i32 define amdgpu_kernel void @insertelement_v3f32_1(ptr addrspace(1) %out, <3 x float> %a) nounwind { ; SI-LABEL: insertelement_v3f32_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000 @@ -323,8 +323,8 @@ define amdgpu_kernel void @insertelement_v3f32_1(ptr addrspace(1) %out, <3 x flo ; ; VI-LABEL: insertelement_v3f32_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000 @@ -341,8 +341,8 @@ define amdgpu_kernel void @insertelement_v3f32_1(ptr addrspace(1) %out, <3 x flo define amdgpu_kernel void @insertelement_v3f32_2(ptr addrspace(1) %out, <3 x float> %a) nounwind { ; SI-LABEL: insertelement_v3f32_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v2, 0x40a00000 @@ -354,8 +354,8 @@ define amdgpu_kernel void @insertelement_v3f32_2(ptr addrspace(1) %out, <3 x flo ; ; VI-LABEL: insertelement_v3f32_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, 
-1 ; VI-NEXT: v_mov_b32_e32 v2, 0x40a00000 @@ -497,8 +497,8 @@ define <12 x float> @insertelement_to_v12f32_undef() nounwind { define amdgpu_kernel void @dynamic_insertelement_v2f32(ptr addrspace(1) %out, <2 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 +; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 @@ -516,8 +516,8 @@ define amdgpu_kernel void @dynamic_insertelement_v2f32(ptr addrspace(1) %out, <2 ; ; VI-LABEL: dynamic_insertelement_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -540,9 +540,9 @@ define amdgpu_kernel void @dynamic_insertelement_v2f32(ptr addrspace(1) %out, <2 define amdgpu_kernel void @dynamic_insertelement_v3f32(ptr addrspace(1) %out, <3 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[4:5], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 +; SI-NEXT: s_load_dword s8, s[6:7], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x4 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -564,9 +564,9 @@ define amdgpu_kernel void @dynamic_insertelement_v3f32(ptr addrspace(1) %out, <3 ; ; VI-LABEL: dynamic_insertelement_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s8, s[4:5], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: 
s_load_dword s8, s[6:7], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x10 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -593,9 +593,9 @@ define amdgpu_kernel void @dynamic_insertelement_v3f32(ptr addrspace(1) %out, <3 define amdgpu_kernel void @dynamic_insertelement_v4f32(ptr addrspace(1) %out, <4 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[4:5], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 +; SI-NEXT: s_load_dword s8, s[6:7], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x4 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -621,9 +621,9 @@ define amdgpu_kernel void @dynamic_insertelement_v4f32(ptr addrspace(1) %out, <4 ; ; VI-LABEL: dynamic_insertelement_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s8, s[4:5], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: s_load_dword s8, s[6:7], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x10 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -654,9 +654,9 @@ define amdgpu_kernel void @dynamic_insertelement_v4f32(ptr addrspace(1) %out, <4 define amdgpu_kernel void @dynamic_insertelement_v8f32(ptr addrspace(1) %out, <8 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dword s4, s[4:5], 0x10 +; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s4, s[6:7], 0x10 ; SI-NEXT: v_mov_b32_e32 v8, 
0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -677,9 +677,9 @@ define amdgpu_kernel void @dynamic_insertelement_v8f32(ptr addrspace(1) %out, <8 ; ; VI-LABEL: dynamic_insertelement_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x40 +; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x40 ; VI-NEXT: v_mov_b32_e32 v8, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -705,10 +705,10 @@ define amdgpu_kernel void @dynamic_insertelement_v8f32(ptr addrspace(1) %out, <8 define amdgpu_kernel void @dynamic_insertelement_v9f32(ptr addrspace(1) %out, <9 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v9f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; SI-NEXT: s_load_dword s6, s[4:5], 0x18 -; SI-NEXT: s_load_dword s4, s[4:5], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; SI-NEXT: s_load_dword s4, s[6:7], 0x18 +; SI-NEXT: s_load_dword s5, s[6:7], 0x20 ; SI-NEXT: v_mov_b32_e32 v9, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -720,8 +720,8 @@ define amdgpu_kernel void @dynamic_insertelement_v9f32(ptr addrspace(1) %out, <9 ; SI-NEXT: v_mov_b32_e32 v5, s13 ; SI-NEXT: v_mov_b32_e32 v6, s14 ; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: v_mov_b32_e32 v8, s6 -; SI-NEXT: s_mov_b32 m0, s4 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_mov_b32 m0, s5 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_movreld_b32_e32 v0, v9 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32 @@ -731,10 +731,10 @@ define amdgpu_kernel void @dynamic_insertelement_v9f32(ptr addrspace(1) %out, <9 ; ; VI-LABEL: dynamic_insertelement_v9f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], 
s[4:5], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s6, s[4:5], 0x60 -; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 +; VI-NEXT: s_load_dword s4, s[6:7], 0x60 +; VI-NEXT: s_load_dword s5, s[6:7], 0x80 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v9, 0x40a00000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 @@ -745,8 +745,8 @@ define amdgpu_kernel void @dynamic_insertelement_v9f32(ptr addrspace(1) %out, <9 ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v6, s14 ; VI-NEXT: v_mov_b32_e32 v7, s15 -; VI-NEXT: v_mov_b32_e32 v8, s6 -; VI-NEXT: s_mov_b32 m0, s4 +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: s_mov_b32 m0, s5 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_movreld_b32_e32 v0, v9 @@ -762,10 +762,10 @@ define amdgpu_kernel void @dynamic_insertelement_v9f32(ptr addrspace(1) %out, <9 define amdgpu_kernel void @dynamic_insertelement_v10f32(ptr addrspace(1) %out, <10 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v10f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x18 -; SI-NEXT: s_load_dword s4, s[4:5], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x18 +; SI-NEXT: s_load_dword s6, s[6:7], 0x20 ; SI-NEXT: v_mov_b32_e32 v10, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -777,9 +777,9 @@ define amdgpu_kernel void @dynamic_insertelement_v10f32(ptr addrspace(1) %out, < ; SI-NEXT: v_mov_b32_e32 v5, s13 ; SI-NEXT: v_mov_b32_e32 v6, s14 ; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: v_mov_b32_e32 v8, s6 -; SI-NEXT: v_mov_b32_e32 v9, s7 -; SI-NEXT: s_mov_b32 m0, s4 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s5 +; SI-NEXT: s_mov_b32 m0, s6 ; 
SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_movreld_b32_e32 v0, v10 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 @@ -789,10 +789,10 @@ define amdgpu_kernel void @dynamic_insertelement_v10f32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v10f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x60 -; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x60 +; VI-NEXT: s_load_dword s6, s[6:7], 0x80 ; VI-NEXT: v_mov_b32_e32 v10, 0x40a00000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 @@ -803,9 +803,9 @@ define amdgpu_kernel void @dynamic_insertelement_v10f32(ptr addrspace(1) %out, < ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v6, s14 ; VI-NEXT: v_mov_b32_e32 v7, s15 -; VI-NEXT: v_mov_b32_e32 v8, s6 -; VI-NEXT: v_mov_b32_e32 v9, s7 -; VI-NEXT: s_mov_b32 m0, s4 +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: v_mov_b32_e32 v9, s5 +; VI-NEXT: s_mov_b32 m0, s6 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_movreld_b32_e32 v0, v10 @@ -821,10 +821,10 @@ define amdgpu_kernel void @dynamic_insertelement_v10f32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v11f32(ptr addrspace(1) %out, <11 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v11f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x18 -; SI-NEXT: s_load_dword s4, s[4:5], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; SI-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x18 +; SI-NEXT: s_load_dword s4, s[6:7], 0x20 ; SI-NEXT: v_mov_b32_e32 v11, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: 
s_waitcnt lgkmcnt(0) @@ -849,8 +849,8 @@ define amdgpu_kernel void @dynamic_insertelement_v11f32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v11f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v11, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -859,8 +859,8 @@ define amdgpu_kernel void @dynamic_insertelement_v11f32(ptr addrspace(1) %out, < ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x60 -; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x60 +; VI-NEXT: s_load_dword s4, s[6:7], 0x80 ; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v6, s14 @@ -883,10 +883,10 @@ define amdgpu_kernel void @dynamic_insertelement_v11f32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v12f32(ptr addrspace(1) %out, <12 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v12f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x18 -; SI-NEXT: s_load_dword s4, s[4:5], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; SI-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x18 +; SI-NEXT: s_load_dword s4, s[6:7], 0x20 ; SI-NEXT: v_mov_b32_e32 v12, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -912,8 +912,8 @@ define amdgpu_kernel void @dynamic_insertelement_v12f32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v12f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx8 
s[8:15], s[6:7], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v12, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -922,8 +922,8 @@ define amdgpu_kernel void @dynamic_insertelement_v12f32(ptr addrspace(1) %out, < ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x60 -; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x60 +; VI-NEXT: s_load_dword s4, s[6:7], 0x80 ; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v6, s14 @@ -947,9 +947,9 @@ define amdgpu_kernel void @dynamic_insertelement_v12f32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v16f32(ptr addrspace(1) %out, <16 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v16f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 -; SI-NEXT: s_load_dword s4, s[4:5], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x10 +; SI-NEXT: s_load_dword s4, s[6:7], 0x20 ; SI-NEXT: v_mov_b32_e32 v16, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -980,9 +980,9 @@ define amdgpu_kernel void @dynamic_insertelement_v16f32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v16f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 +; VI-NEXT: s_load_dword s4, s[6:7], 0x80 ; VI-NEXT: v_mov_b32_e32 v16, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -1018,8 +1018,8 @@ define amdgpu_kernel void @dynamic_insertelement_v16f32(ptr addrspace(1) %out, < define amdgpu_kernel void 
@dynamic_insertelement_v2i32(ptr addrspace(1) %out, <2 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 +; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1034,8 +1034,8 @@ define amdgpu_kernel void @dynamic_insertelement_v2i32(ptr addrspace(1) %out, <2 ; ; VI-LABEL: dynamic_insertelement_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1055,9 +1055,9 @@ define amdgpu_kernel void @dynamic_insertelement_v2i32(ptr addrspace(1) %out, <2 define amdgpu_kernel void @dynamic_insertelement_v3i32(ptr addrspace(1) %out, <3 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[4:5], 0x8 -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_load_dword s8, s[6:7], 0x8 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1075,9 +1075,9 @@ define amdgpu_kernel void @dynamic_insertelement_v3i32(ptr addrspace(1) %out, <3 ; ; VI-LABEL: dynamic_insertelement_v3i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s8, s[4:5], 0x20 -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_load_dword s8, s[6:7], 0x20 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, 
-1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1100,10 +1100,10 @@ define amdgpu_kernel void @dynamic_insertelement_v3i32(ptr addrspace(1) %out, <3 define amdgpu_kernel void @dynamic_insertelement_v4i32(ptr addrspace(1) %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind { ; SI-LABEL: dynamic_insertelement_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 -; SI-NEXT: s_load_dword s8, s[4:5], 0x8 -; SI-NEXT: s_load_dword s9, s[4:5], 0x11 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 +; SI-NEXT: s_load_dword s8, s[6:7], 0x8 +; SI-NEXT: s_load_dword s9, s[6:7], 0x11 +; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1124,10 +1124,10 @@ define amdgpu_kernel void @dynamic_insertelement_v4i32(ptr addrspace(1) %out, <4 ; ; VI-LABEL: dynamic_insertelement_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; VI-NEXT: s_load_dword s8, s[4:5], 0x20 -; VI-NEXT: s_load_dword s9, s[4:5], 0x44 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; VI-NEXT: s_load_dword s8, s[6:7], 0x20 +; VI-NEXT: s_load_dword s9, s[6:7], 0x44 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1153,9 +1153,9 @@ define amdgpu_kernel void @dynamic_insertelement_v4i32(ptr addrspace(1) %out, <4 define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v8i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dword s4, s[4:5], 0x10 +; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s4, s[6:7], 0x10 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, 
-1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1175,9 +1175,9 @@ define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8 ; ; VI-LABEL: dynamic_insertelement_v8i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x40 +; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x40 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1202,10 +1202,10 @@ define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8 define amdgpu_kernel void @dynamic_insertelement_v9i32(ptr addrspace(1) %out, <9 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v9i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; SI-NEXT: s_load_dword s6, s[4:5], 0x18 -; SI-NEXT: s_load_dword s4, s[4:5], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; SI-NEXT: s_load_dword s4, s[6:7], 0x18 +; SI-NEXT: s_load_dword s5, s[6:7], 0x20 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1217,8 +1217,8 @@ define amdgpu_kernel void @dynamic_insertelement_v9i32(ptr addrspace(1) %out, <9 ; SI-NEXT: v_mov_b32_e32 v5, s13 ; SI-NEXT: v_mov_b32_e32 v6, s14 ; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: v_mov_b32_e32 v8, s6 -; SI-NEXT: s_mov_b32 m0, s4 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_mov_b32 m0, s5 ; SI-NEXT: v_movreld_b32_e32 v0, 5 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 @@ -1227,10 +1227,10 @@ define amdgpu_kernel void @dynamic_insertelement_v9i32(ptr addrspace(1) %out, <9 ; ; VI-LABEL: dynamic_insertelement_v9i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; 
VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s6, s[4:5], 0x60 -; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 +; VI-NEXT: s_load_dword s4, s[6:7], 0x60 +; VI-NEXT: s_load_dword s5, s[6:7], 0x80 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1242,8 +1242,8 @@ define amdgpu_kernel void @dynamic_insertelement_v9i32(ptr addrspace(1) %out, <9 ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v6, s14 ; VI-NEXT: v_mov_b32_e32 v7, s15 -; VI-NEXT: v_mov_b32_e32 v8, s6 -; VI-NEXT: s_mov_b32 m0, s4 +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: s_mov_b32 m0, s5 ; VI-NEXT: v_movreld_b32_e32 v0, 5 ; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 @@ -1257,10 +1257,10 @@ define amdgpu_kernel void @dynamic_insertelement_v9i32(ptr addrspace(1) %out, <9 define amdgpu_kernel void @dynamic_insertelement_v10i32(ptr addrspace(1) %out, <10 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v10i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x18 -; SI-NEXT: s_load_dword s4, s[4:5], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x18 +; SI-NEXT: s_load_dword s6, s[6:7], 0x20 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1272,9 +1272,9 @@ define amdgpu_kernel void @dynamic_insertelement_v10i32(ptr addrspace(1) %out, < ; SI-NEXT: v_mov_b32_e32 v5, s13 ; SI-NEXT: v_mov_b32_e32 v6, s14 ; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: v_mov_b32_e32 v8, s6 -; SI-NEXT: v_mov_b32_e32 v9, s7 -; SI-NEXT: s_mov_b32 m0, s4 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s5 +; 
SI-NEXT: s_mov_b32 m0, s6 ; SI-NEXT: v_movreld_b32_e32 v0, 5 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -1283,10 +1283,10 @@ define amdgpu_kernel void @dynamic_insertelement_v10i32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v10i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x60 -; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x60 +; VI-NEXT: s_load_dword s6, s[6:7], 0x80 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 @@ -1297,9 +1297,9 @@ define amdgpu_kernel void @dynamic_insertelement_v10i32(ptr addrspace(1) %out, < ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v6, s14 ; VI-NEXT: v_mov_b32_e32 v7, s15 -; VI-NEXT: v_mov_b32_e32 v8, s6 -; VI-NEXT: v_mov_b32_e32 v9, s7 -; VI-NEXT: s_mov_b32 m0, s4 +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: v_mov_b32_e32 v9, s5 +; VI-NEXT: s_mov_b32 m0, s6 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_movreld_b32_e32 v0, 5 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 @@ -1314,10 +1314,10 @@ define amdgpu_kernel void @dynamic_insertelement_v10i32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v11i32(ptr addrspace(1) %out, <11 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v11i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x18 -; SI-NEXT: s_load_dword s4, s[4:5], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; SI-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x18 +; SI-NEXT: s_load_dword s4, s[6:7], 0x20 ; SI-NEXT: 
s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1341,17 +1341,17 @@ define amdgpu_kernel void @dynamic_insertelement_v11i32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v11i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dword s4, s[6:7], 0x80 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x60 -; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x60 ; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v6, s14 @@ -1374,10 +1374,10 @@ define amdgpu_kernel void @dynamic_insertelement_v11i32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v12i32(ptr addrspace(1) %out, <12 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v12i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x18 -; SI-NEXT: s_load_dword s4, s[4:5], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; SI-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x18 +; SI-NEXT: s_load_dword s4, s[6:7], 0x20 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1402,17 +1402,17 @@ define amdgpu_kernel void @dynamic_insertelement_v12i32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v12i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 
0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dword s4, s[6:7], 0x80 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x60 -; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x60 ; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v6, s14 @@ -1436,9 +1436,9 @@ define amdgpu_kernel void @dynamic_insertelement_v12i32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v16i32(ptr addrspace(1) %out, <16 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v16i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 -; SI-NEXT: s_load_dword s6, s[4:5], 0x20 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x10 +; SI-NEXT: s_load_dword s4, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1458,7 +1458,7 @@ define amdgpu_kernel void @dynamic_insertelement_v16i32(ptr addrspace(1) %out, < ; SI-NEXT: v_mov_b32_e32 v13, s21 ; SI-NEXT: v_mov_b32_e32 v14, s22 ; SI-NEXT: v_mov_b32_e32 v15, s23 -; SI-NEXT: s_mov_b32 m0, s6 +; SI-NEXT: s_mov_b32 m0, s4 ; SI-NEXT: v_movreld_b32_e32 v0, 5 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 @@ -1468,9 +1468,9 @@ define amdgpu_kernel void @dynamic_insertelement_v16i32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v16i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; VI-NEXT: s_load_dword s6, s[4:5], 0x80 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 +; VI-NEXT: 
s_load_dword s4, s[6:7], 0x80 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1490,7 +1490,7 @@ define amdgpu_kernel void @dynamic_insertelement_v16i32(ptr addrspace(1) %out, < ; VI-NEXT: v_mov_b32_e32 v13, s21 ; VI-NEXT: v_mov_b32_e32 v14, s22 ; VI-NEXT: v_mov_b32_e32 v15, s23 -; VI-NEXT: s_mov_b32 m0, s6 +; VI-NEXT: s_mov_b32 m0, s4 ; VI-NEXT: v_movreld_b32_e32 v0, 5 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 @@ -1505,7 +1505,7 @@ define amdgpu_kernel void @dynamic_insertelement_v16i32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2 x i16> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1522,7 +1522,7 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2 ; ; VI-LABEL: dynamic_insertelement_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1544,8 +1544,8 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2 define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 x i16> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_load_dword s8, s[4:5], 0x4 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dword s8, s[6:7], 0x4 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1565,8 +1565,8 @@ define amdgpu_kernel 
void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 ; ; VI-LABEL: dynamic_insertelement_v3i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s8, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s8, s[6:7], 0x10 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1592,33 +1592,33 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s6, s[4:5], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dword s4, s[4:5], 0xa +; SI-NEXT: s_load_dword s4, s[6:7], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s5, s[6:7], 0xa ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s5, s6, 3 -; SI-NEXT: s_lshl_b32 s5, 0xff, s5 -; SI-NEXT: s_andn2_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s5, 0x505 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_lshl_b32 s4, s4, 3 +; SI-NEXT: s_lshl_b32 s4, 0xff, s4 +; SI-NEXT: s_andn2_b32 s5, s5, s4 +; SI-NEXT: s_and_b32 s4, s4, 0x505 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v2i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x28 +; VI-NEXT: s_load_dword s4, s[6:7], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s5, s[6:7], 0x28 ; VI-NEXT: v_mov_b32_e32 v0, 0xff ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s5, s6, 3 -; VI-NEXT: v_lshlrev_b16_e32 v0, s5, v0 +; VI-NEXT: 
s_lshl_b32 s4, s4, 3 +; VI-NEXT: v_lshlrev_b16_e32 v0, s4, v0 ; VI-NEXT: v_not_b32_e32 v1, v0 -; VI-NEXT: v_and_b32_e32 v1, s4, v1 +; VI-NEXT: v_and_b32_e32 v1, s5, v1 ; VI-NEXT: v_and_b32_e32 v0, 0x505, v0 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1634,17 +1634,17 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8 define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s6, s[4:5], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dword s4, s[4:5], 0xa +; SI-NEXT: s_load_dword s4, s[6:7], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s5, s[6:7], 0xa ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s5, s6, 3 -; SI-NEXT: s_lshl_b32 s5, 0xff, s5 -; SI-NEXT: s_andn2_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s5, 0x5050505 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_lshl_b32 s4, s4, 3 +; SI-NEXT: s_lshl_b32 s4, 0xff, s4 +; SI-NEXT: s_andn2_b32 s5, s5, s4 +; SI-NEXT: s_and_b32 s4, s4, 0x5050505 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -1654,17 +1654,17 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 ; ; VI-LABEL: dynamic_insertelement_v3i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x28 +; VI-NEXT: s_load_dword s4, s[6:7], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s5, s[6:7], 0x28 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s5, s6, 3 -; VI-NEXT: s_lshl_b32 s5, 0xff, s5 -; VI-NEXT: s_andn2_b32 s4, 
s4, s5 -; VI-NEXT: s_and_b32 s5, s5, 0x5050505 -; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_lshl_b32 s4, s4, 3 +; VI-NEXT: s_lshl_b32 s4, 0xff, s4 +; VI-NEXT: s_andn2_b32 s5, s5, s4 +; VI-NEXT: s_and_b32 s4, s4, 0x5050505 +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_lshr_b32 s5, s4, 16 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -1679,34 +1679,34 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v4i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s6, s[4:5], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dword s4, s[4:5], 0xa +; SI-NEXT: s_load_dword s4, s[6:7], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s5, s[6:7], 0xa ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s5, s6, 3 -; SI-NEXT: s_lshl_b32 s5, 0xff, s5 -; SI-NEXT: s_andn2_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s5, 0x5050505 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_lshl_b32 s4, s4, 3 +; SI-NEXT: s_lshl_b32 s4, 0xff, s4 +; SI-NEXT: s_andn2_b32 s5, s5, s4 +; SI-NEXT: s_and_b32 s4, s4, 0x5050505 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v4i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x28 +; VI-NEXT: s_load_dword s4, s[6:7], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s5, s[6:7], 0x28 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s5, s6, 3 -; VI-NEXT: s_lshl_b32 s5, 0xff, s5 -; VI-NEXT: s_andn2_b32 s4, s4, s5 -; VI-NEXT: 
s_and_b32 s5, s5, 0x5050505 -; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_lshl_b32 s4, s4, 3 +; VI-NEXT: s_lshl_b32 s4, 0xff, s4 +; VI-NEXT: s_andn2_b32 s5, s5, s4 +; VI-NEXT: s_and_b32 s4, s4, 0x5050505 +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -1718,46 +1718,46 @@ define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8 define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, ptr addrspace(4) %a.ptr, i32 %b) nounwind { ; SI-LABEL: s_dynamic_insertelement_v8i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_load_dword s8, s[4:5], 0x4 -; SI-NEXT: s_mov_b32 s7, 0x100f000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dword s4, s[6:7], 0x4 +; SI-NEXT: s_mov_b32 s11, 0x100f000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_lshl_b32 s0, s8, 3 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_lshl_b32 s0, s4, 3 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_lshl_b64 s[0:1], 0xff, s0 -; SI-NEXT: s_and_b32 s9, s1, 0x5050505 +; SI-NEXT: s_and_b32 s5, s1, 0x5050505 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] -; SI-NEXT: s_and_b32 s8, s0, 0x5050505 -; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[2:3] +; SI-NEXT: s_and_b32 s4, s0, 0x5050505 +; SI-NEXT: s_or_b64 s[0:1], s[4:5], s[2:3] ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_dynamic_insertelement_v8i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s8, s[4:5], 0x10 -; VI-NEXT: s_mov_b32 s7, 0x1100f000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], 
s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_mov_b32 s11, 0x1100f000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_lshl_b32 s0, s8, 3 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_lshl_b32 s0, s4, 3 +; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_lshl_b64 s[0:1], 0xff, s0 -; VI-NEXT: s_and_b32 s9, s1, 0x5050505 +; VI-NEXT: s_and_b32 s5, s1, 0x5050505 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] -; VI-NEXT: s_and_b32 s8, s0, 0x5050505 -; VI-NEXT: s_or_b64 s[0:1], s[8:9], s[2:3] +; VI-NEXT: s_and_b32 s4, s0, 0x5050505 +; VI-NEXT: s_or_b64 s[0:1], s[4:5], s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: s_endpgm %a = load <8 x i8>, ptr addrspace(4) %a.ptr, align 4 %vecins = insertelement <8 x i8> %a, i8 5, i32 %b @@ -1768,196 +1768,196 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, p define amdgpu_kernel void @dynamic_insertelement_v16i8(ptr addrspace(1) %out, <16 x i8> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v16i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 -; SI-NEXT: s_load_dword s6, s[4:5], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x4 +; SI-NEXT: s_load_dword s4, s[6:7], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s11, 24 -; SI-NEXT: s_cmp_lg_u32 s6, 15 -; SI-NEXT: s_cselect_b32 s4, s4, 5 -; SI-NEXT: s_lshl_b32 s4, s4, 24 -; SI-NEXT: s_lshr_b32 s5, s11, 16 -; SI-NEXT: s_cmp_lg_u32 s6, 14 +; SI-NEXT: s_lshr_b32 s5, s11, 24 +; SI-NEXT: s_cmp_lg_u32 s4, 15 ; SI-NEXT: s_cselect_b32 s5, s5, 5 -; 
SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshr_b32 s5, s11, 8 -; SI-NEXT: s_cmp_lg_u32 s6, 13 -; SI-NEXT: s_cselect_b32 s5, s5, 5 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_cmp_lg_u32 s6, 12 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshr_b32 s6, s11, 16 +; SI-NEXT: s_cmp_lg_u32 s4, 14 +; SI-NEXT: s_cselect_b32 s6, s6, 5 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_lshr_b32 s6, s11, 8 +; SI-NEXT: s_cmp_lg_u32 s4, 13 +; SI-NEXT: s_cselect_b32 s6, s6, 5 +; SI-NEXT: s_lshl_b32 s6, s6, 8 +; SI-NEXT: s_cmp_lg_u32 s4, 12 ; SI-NEXT: s_cselect_b32 s7, s11, 5 ; SI-NEXT: s_and_b32 s7, s7, 0xff -; SI-NEXT: s_or_b32 s5, s7, s5 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_lshr_b32 s5, s10, 24 -; SI-NEXT: s_cmp_lg_u32 s6, 11 -; SI-NEXT: s_cselect_b32 s5, s5, 5 -; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_lshr_b32 s6, s10, 24 +; SI-NEXT: s_cmp_lg_u32 s4, 11 +; SI-NEXT: s_cselect_b32 s6, s6, 5 +; SI-NEXT: s_lshl_b32 s6, s6, 24 ; SI-NEXT: s_lshr_b32 s7, s10, 16 -; SI-NEXT: s_cmp_lg_u32 s6, 10 +; SI-NEXT: s_cmp_lg_u32 s4, 10 ; SI-NEXT: s_cselect_b32 s7, s7, 5 ; SI-NEXT: s_and_b32 s7, s7, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_lshr_b32 s7, s10, 8 -; SI-NEXT: s_cmp_lg_u32 s6, 9 +; SI-NEXT: s_cmp_lg_u32 s4, 9 ; SI-NEXT: s_cselect_b32 s7, s7, 5 ; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_cmp_lg_u32 s6, 8 +; SI-NEXT: s_cmp_lg_u32 s4, 8 ; SI-NEXT: s_cselect_b32 s10, s10, 5 ; SI-NEXT: s_and_b32 s10, s10, 0xff ; SI-NEXT: s_or_b32 s7, s10, s7 ; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_lshr_b32 s7, s9, 24 -; SI-NEXT: s_cmp_lg_u32 s6, 7 +; SI-NEXT: s_cmp_lg_u32 
s4, 7 ; SI-NEXT: s_cselect_b32 s7, s7, 5 ; SI-NEXT: s_lshl_b32 s7, s7, 24 ; SI-NEXT: s_lshr_b32 s10, s9, 16 -; SI-NEXT: s_cmp_lg_u32 s6, 6 +; SI-NEXT: s_cmp_lg_u32 s4, 6 ; SI-NEXT: s_cselect_b32 s10, s10, 5 ; SI-NEXT: s_and_b32 s10, s10, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_or_b32 s7, s7, s10 ; SI-NEXT: s_lshr_b32 s10, s9, 8 -; SI-NEXT: s_cmp_lg_u32 s6, 5 +; SI-NEXT: s_cmp_lg_u32 s4, 5 ; SI-NEXT: s_cselect_b32 s10, s10, 5 ; SI-NEXT: s_lshl_b32 s10, s10, 8 -; SI-NEXT: s_cmp_lg_u32 s6, 4 +; SI-NEXT: s_cmp_lg_u32 s4, 4 ; SI-NEXT: s_cselect_b32 s9, s9, 5 ; SI-NEXT: s_and_b32 s9, s9, 0xff ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: s_or_b32 s7, s9, s7 ; SI-NEXT: s_lshr_b32 s9, s8, 24 -; SI-NEXT: s_cmp_lg_u32 s6, 3 +; SI-NEXT: s_cmp_lg_u32 s4, 3 ; SI-NEXT: s_cselect_b32 s9, s9, 5 ; SI-NEXT: s_lshl_b32 s9, s9, 24 ; SI-NEXT: s_lshr_b32 s10, s8, 16 -; SI-NEXT: s_cmp_lg_u32 s6, 2 +; SI-NEXT: s_cmp_lg_u32 s4, 2 ; SI-NEXT: s_cselect_b32 s10, s10, 5 ; SI-NEXT: s_and_b32 s10, s10, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_lshr_b32 s10, s8, 8 -; SI-NEXT: s_cmp_lg_u32 s6, 1 +; SI-NEXT: s_cmp_lg_u32 s4, 1 ; SI-NEXT: s_cselect_b32 s10, s10, 5 ; SI-NEXT: s_lshl_b32 s10, s10, 8 -; SI-NEXT: s_cmp_lg_u32 s6, 0 -; SI-NEXT: s_cselect_b32 s6, s8, 5 -; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_or_b32 s6, s6, s10 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s6, s6, s9 -; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_cmp_lg_u32 s4, 0 +; SI-NEXT: s_cselect_b32 s4, s8, 5 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_or_b32 s4, s4, s10 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s4, s4, s9 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s5 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: 
dynamic_insertelement_v16i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 -; VI-NEXT: s_load_dword s6, s[4:5], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x10 +; VI-NEXT: s_load_dword s4, s[6:7], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s11, 24 -; VI-NEXT: s_cmp_lg_u32 s6, 15 -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_lshr_b32 s5, s11, 24 +; VI-NEXT: s_cmp_lg_u32 s4, 15 +; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s4, s11, 16 -; VI-NEXT: s_cmp_lg_u32 s6, 14 +; VI-NEXT: s_lshr_b32 s5, s11, 16 +; VI-NEXT: s_cmp_lg_u32 s4, 14 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s4, s11, 8 +; VI-NEXT: s_lshr_b32 s5, s11, 8 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; VI-NEXT: s_cmp_lg_u32 s6, 13 +; VI-NEXT: s_cmp_lg_u32 s4, 13 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s6, 12 +; VI-NEXT: s_cmp_lg_u32 s4, 12 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, s11 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc -; VI-NEXT: s_lshr_b32 s4, s10, 24 +; VI-NEXT: s_lshr_b32 s5, s10, 24 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_cmp_lg_u32 s6, 11 +; VI-NEXT: s_cmp_lg_u32 s4, 11 ; VI-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: 
v_mov_b32_e32 v0, s5 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s4, s10, 16 -; VI-NEXT: s_cmp_lg_u32 s6, 10 +; VI-NEXT: s_lshr_b32 s5, s10, 16 +; VI-NEXT: s_cmp_lg_u32 s4, 10 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s4, s10, 8 +; VI-NEXT: s_lshr_b32 s5, s10, 8 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; VI-NEXT: s_cmp_lg_u32 s6, 9 +; VI-NEXT: s_cmp_lg_u32 s4, 9 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s6, 8 +; VI-NEXT: s_cmp_lg_u32 s4, 8 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc -; VI-NEXT: s_lshr_b32 s4, s9, 24 +; VI-NEXT: s_lshr_b32 s5, s9, 24 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_cmp_lg_u32 s6, 7 +; VI-NEXT: s_cmp_lg_u32 s4, 7 ; VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s4, s9, 16 -; VI-NEXT: s_cmp_lg_u32 s6, 6 +; VI-NEXT: s_lshr_b32 s5, s9, 16 +; VI-NEXT: s_cmp_lg_u32 s4, 6 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s4, s9, 8 +; VI-NEXT: s_lshr_b32 s5, s9, 8 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; VI-NEXT: s_cmp_lg_u32 s6, 5 +; VI-NEXT: s_cmp_lg_u32 s4, 5 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s6, 4 +; VI-NEXT: s_cmp_lg_u32 s4, 4 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc ; VI-NEXT: v_mov_b32_e32 v4, s9 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc -; VI-NEXT: s_lshr_b32 s4, s8, 24 +; VI-NEXT: s_lshr_b32 s5, s8, 24 ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_cmp_lg_u32 s6, 3 +; VI-NEXT: s_cmp_lg_u32 s4, 3 ; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s4, s8, 16 -; VI-NEXT: s_cmp_lg_u32 s6, 2 +; VI-NEXT: s_lshr_b32 s5, s8, 16 +; VI-NEXT: s_cmp_lg_u32 s4, 2 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v4, s5 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s4, s8, 8 +; VI-NEXT: s_lshr_b32 s5, s8, 8 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc -; VI-NEXT: s_cmp_lg_u32 s6, 1 +; VI-NEXT: s_cmp_lg_u32 s4, 1 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v4, s5 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s6, 0 +; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc ; VI-NEXT: v_mov_b32_e32 v5, s8 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1977,26 +1977,26 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(ptr addrspace(1) %out, <1 define amdgpu_kernel void @insert_split_bb(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %a, i32 %b) { ; SI-LABEL: insert_split_bb: ; SI: ; %bb.0: ; %entry -; SI-NEXT: 
s_load_dword s6, s[4:5], 0x4 -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dword s4, s[6:7], 0x4 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s6, 0 +; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: s_cbranch_scc0 .LBB42_4 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_load_dword s7, s[2:3], 0x1 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_load_dword s5, s[2:3], 0x1 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc ; SI-NEXT: s_cbranch_vccnz .LBB42_3 ; SI-NEXT: .LBB42_2: ; %if -; SI-NEXT: s_load_dword s7, s[2:3], 0x0 +; SI-NEXT: s_load_dword s5, s[2:3], 0x0 ; SI-NEXT: .LBB42_3: ; %endif ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB42_4: @@ -2004,23 +2004,23 @@ define amdgpu_kernel void @insert_split_bb(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: insert_split_bb: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s6, s[4:5], 0x10 -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s6, 0 +; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: s_cbranch_scc0 .LBB42_4 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_load_dword s7, s[2:3], 0x4 +; VI-NEXT: s_load_dword s5, s[2:3], 0x4 ; VI-NEXT: s_cbranch_execnz .LBB42_3 ; VI-NEXT: .LBB42_2: ; %if ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s7, s[2:3], 0x0 +; VI-NEXT: s_load_dword s5, s[2:3], 0x0 ; VI-NEXT: .LBB42_3: ; %endif ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; 
VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB42_4: @@ -2050,9 +2050,9 @@ endif: define amdgpu_kernel void @dynamic_insertelement_v2f64(ptr addrspace(1) %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[4:5], 0x18 -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xc -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_load_dword s8, s[6:7], 0x18 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0xc +; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2071,9 +2071,9 @@ define amdgpu_kernel void @dynamic_insertelement_v2f64(ptr addrspace(1) %out, [8 ; ; VI-LABEL: dynamic_insertelement_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s8, s[4:5], 0x60 -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x30 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_load_dword s8, s[6:7], 0x60 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x30 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2097,9 +2097,9 @@ define amdgpu_kernel void @dynamic_insertelement_v2f64(ptr addrspace(1) %out, [8 define amdgpu_kernel void @dynamic_insertelement_v2i64(ptr addrspace(1) %out, <2 x i64> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[4:5], 0x8 -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_load_dword s8, s[6:7], 0x8 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2118,9 +2118,9 @@ 
define amdgpu_kernel void @dynamic_insertelement_v2i64(ptr addrspace(1) %out, <2 ; ; VI-LABEL: dynamic_insertelement_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s8, s[4:5], 0x20 -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_load_dword s8, s[6:7], 0x20 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2144,20 +2144,20 @@ define amdgpu_kernel void @dynamic_insertelement_v2i64(ptr addrspace(1) %out, <2 define amdgpu_kernel void @dynamic_insertelement_v3i64(ptr addrspace(1) %out, <3 x i64> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s6, s[4:5], 0x10 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xc +; SI-NEXT: s_load_dword s12, s[6:7], 0x10 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x8 +; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0xc ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s6, 1 +; SI-NEXT: s_cmp_eq_u32 s12, 1 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_cselect_b32 s7, 0, s11 -; SI-NEXT: s_cselect_b32 s10, 5, s10 -; SI-NEXT: s_cmp_eq_u32 s6, 0 +; SI-NEXT: s_cselect_b32 s6, 0, s11 +; SI-NEXT: s_cselect_b32 s7, 5, s10 +; SI-NEXT: s_cmp_eq_u32 s12, 0 ; SI-NEXT: s_cselect_b32 s9, 0, s9 ; SI-NEXT: s_cselect_b32 s8, 5, s8 -; SI-NEXT: s_cmp_eq_u32 s6, 2 +; SI-NEXT: s_cmp_eq_u32 s12, 2 ; SI-NEXT: s_cselect_b32 s5, 0, s5 ; SI-NEXT: s_cselect_b32 s4, 5, s4 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -2165,27 +2165,27 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(ptr addrspace(1) %out, <3 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; 
SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_mov_b32_e32 v3, s6 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v3i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x30 +; VI-NEXT: s_load_dword s12, s[6:7], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x20 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x30 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s6, 1 +; VI-NEXT: s_cmp_eq_u32 s12, 1 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_cselect_b32 s7, 0, s11 -; VI-NEXT: s_cselect_b32 s10, 5, s10 -; VI-NEXT: s_cmp_eq_u32 s6, 0 +; VI-NEXT: s_cselect_b32 s6, 0, s11 +; VI-NEXT: s_cselect_b32 s7, 5, s10 +; VI-NEXT: s_cmp_eq_u32 s12, 0 ; VI-NEXT: s_cselect_b32 s9, 0, s9 ; VI-NEXT: s_cselect_b32 s8, 5, s8 -; VI-NEXT: s_cmp_eq_u32 s6, 2 +; VI-NEXT: s_cmp_eq_u32 s12, 2 ; VI-NEXT: s_cselect_b32 s5, 0, s5 ; VI-NEXT: s_cselect_b32 s4, 5, s4 ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -2193,8 +2193,8 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(ptr addrspace(1) %out, <3 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v3, s6 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <3 x i64> %a, i64 5, i32 %b @@ -2205,67 +2205,67 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(ptr addrspace(1) %out, <3 define amdgpu_kernel void @dynamic_insertelement_v4f64(ptr addrspace(1) %out, <4 x double> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v4f64: ; SI: 
; %bb.0: -; SI-NEXT: s_load_dword s6, s[4:5], 0x10 -; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dword s4, s[6:7], 0x10 +; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s6, 1 -; SI-NEXT: s_cselect_b32 s4, 0x40200000, s11 -; SI-NEXT: s_cselect_b32 s5, 0, s10 -; SI-NEXT: s_cmp_eq_u32 s6, 0 +; SI-NEXT: s_cmp_eq_u32 s4, 1 +; SI-NEXT: s_cselect_b32 s5, 0x40200000, s11 +; SI-NEXT: s_cselect_b32 s6, 0, s10 +; SI-NEXT: s_cmp_eq_u32 s4, 0 ; SI-NEXT: s_cselect_b32 s7, 0x40200000, s9 ; SI-NEXT: s_cselect_b32 s8, 0, s8 -; SI-NEXT: s_cmp_eq_u32 s6, 3 +; SI-NEXT: s_cmp_eq_u32 s4, 3 ; SI-NEXT: s_cselect_b32 s9, 0x40200000, s15 ; SI-NEXT: s_cselect_b32 s10, 0, s14 -; SI-NEXT: s_cmp_eq_u32 s6, 2 -; SI-NEXT: s_cselect_b32 s6, 0x40200000, s13 +; SI-NEXT: s_cmp_eq_u32 s4, 2 +; SI-NEXT: s_cselect_b32 s4, 0x40200000, s13 ; SI-NEXT: s_cselect_b32 s11, 0, s12 ; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; SI-NEXT: s_nop 0 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s5 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x40 -; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x40 +; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 
; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s6, 1 -; VI-NEXT: s_cselect_b32 s4, 0x40200000, s11 -; VI-NEXT: s_cselect_b32 s5, 0, s10 -; VI-NEXT: s_cmp_eq_u32 s6, 0 +; VI-NEXT: s_cmp_eq_u32 s4, 1 +; VI-NEXT: s_cselect_b32 s5, 0x40200000, s11 +; VI-NEXT: s_cselect_b32 s6, 0, s10 +; VI-NEXT: s_cmp_eq_u32 s4, 0 ; VI-NEXT: s_cselect_b32 s7, 0x40200000, s9 ; VI-NEXT: s_cselect_b32 s8, 0, s8 -; VI-NEXT: s_cmp_eq_u32 s6, 3 +; VI-NEXT: s_cmp_eq_u32 s4, 3 ; VI-NEXT: s_cselect_b32 s9, 0x40200000, s15 ; VI-NEXT: s_cselect_b32 s10, 0, s14 -; VI-NEXT: s_cmp_eq_u32 s6, 2 -; VI-NEXT: s_cselect_b32 s6, 0x40200000, s13 +; VI-NEXT: s_cmp_eq_u32 s4, 2 +; VI-NEXT: s_cselect_b32 s4, 0x40200000, s13 ; VI-NEXT: s_cselect_b32 s11, 0, s12 ; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s9 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; VI-NEXT: s_nop 0 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x double> %a, double 8.0, i32 %b @@ -2276,13 +2276,13 @@ define amdgpu_kernel void @dynamic_insertelement_v4f64(ptr addrspace(1) %out, <4 define amdgpu_kernel void @dynamic_insertelement_v8f64(ptr addrspace(1) %out, <8 x double> %a, i32 %b) #0 { ; SI-LABEL: dynamic_insertelement_v8f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s6, s[4:5], 0x20 -; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dword s4, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x10 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; SI-NEXT: v_mov_b32_e32 v16, 0x40200000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s4, 
s6, 1 +; SI-NEXT: s_lshl_b32 s4, s4, 1 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 @@ -2311,13 +2311,13 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(ptr addrspace(1) %out, <8 ; ; VI-LABEL: dynamic_insertelement_v8f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x80 -; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x80 +; VI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v16, 0x40200000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s4, s6, 1 +; VI-NEXT: s_lshl_b32 s4, s4, 1 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s10 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll index c9b01eb5a9725..3135addec1618 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; SI-LABEL: s_insertelement_v2bf16_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s4, s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 @@ -21,7 +21,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: s_insertelement_v2bf16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -35,7 +35,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: 
s_insertelement_v2bf16_0: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -48,7 +48,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; ; GFX940-LABEL: s_insertelement_v2bf16_0: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -58,7 +58,6 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; GFX940-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm -; %vec = load <2 x bfloat>, ptr addrspace(4) %vec.ptr %vecins = insertelement <2 x bfloat> %vec, bfloat 5.000000e+00, i32 0 store <2 x bfloat> %vecins, ptr addrspace(1) %out @@ -68,7 +67,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; SI-LABEL: s_insertelement_v2bf16_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s4, s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 @@ -82,7 +81,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: s_insertelement_v2bf16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -96,7 +95,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: s_insertelement_v2bf16_1: ; GFX900: ; 
%bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -108,7 +107,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; ; GFX940-LABEL: s_insertelement_v2bf16_1: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -117,7 +116,6 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; GFX940-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm -; %vec = load <2 x bfloat>, ptr addrspace(4) %vec.ptr %vecins = insertelement <2 x bfloat> %vec, bfloat 5.000000e+00, i32 1 store <2 x bfloat> %vecins, ptr addrspace(1) %out @@ -127,7 +125,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_insertelement_v2bf16_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -144,7 +142,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v2bf16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -162,7 +160,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: v_insertelement_v2bf16_0: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0x40a0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -175,7 +173,8 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; ; GFX940-LABEL: v_insertelement_v2bf16_0: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0x40a0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -185,7 +184,6 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; GFX940-NEXT: v_bfi_b32 v1, s2, v2, v1 ; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm -; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -199,7 +197,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_insertelement_v2bf16_0_inlineimm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -216,7 +214,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v2bf16_0_inlineimm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -234,7 +232,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) % ; ; GFX900-LABEL: v_insertelement_v2bf16_0_inlineimm: ; 
GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dword v1, v0, s[2:3] @@ -246,7 +244,8 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) % ; ; GFX940-LABEL: v_insertelement_v2bf16_0_inlineimm: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dword v1, v0, s[2:3] @@ -255,7 +254,6 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) % ; GFX940-NEXT: v_bfi_b32 v1, s2, 53, v1 ; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm -; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -269,7 +267,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) % define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_insertelement_v2bf16_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -286,7 +284,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v2bf16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -304,7 +302,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; ; 
GFX900-LABEL: v_insertelement_v2bf16_1: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -317,7 +315,8 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; ; GFX940-LABEL: v_insertelement_v2bf16_1: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -327,7 +326,6 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; GFX940-NEXT: v_perm_b32 v1, s2, v1, v2 ; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm -; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -341,7 +339,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_insertelement_v2bf16_1_inlineimm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -358,7 +356,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v2bf16_1_inlineimm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -376,7 +374,7 @@ define amdgpu_kernel void 
@v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) % ; ; GFX900-LABEL: v_insertelement_v2bf16_1_inlineimm: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -388,7 +386,8 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) % ; ; GFX940-LABEL: v_insertelement_v2bf16_1_inlineimm: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -397,7 +396,6 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) % ; GFX940-NEXT: v_perm_b32 v1, 35, v1, v2 ; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm -; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -411,8 +409,8 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) % define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %idx.ptr) #0 { ; SI-LABEL: v_insertelement_v2bf16_dynamic_vgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; SI-NEXT: s_mov_b32 s11, 0x100f000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -434,8 +432,8 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) ; ; VI-LABEL: v_insertelement_v2bf16_dynamic_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; 
VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -461,11 +459,11 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) ; ; GFX900-LABEL: v_insertelement_v2bf16_dynamic_vgpr: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: global_load_dword v1, v0, s[6:7] +; GFX900-NEXT: global_load_dword v1, v0, s[4:5] ; GFX900-NEXT: global_load_dword v2, v0, s[2:3] ; GFX900-NEXT: s_mov_b32 s2, 0xffff ; GFX900-NEXT: s_waitcnt vmcnt(1) @@ -479,13 +477,14 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) ; ; GFX940-LABEL: v_insertelement_v2bf16_dynamic_vgpr: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x10 -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: s_mov_b32 s0, 0xffff ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_load_dword v1, v0, s[2:3] +; GFX940-NEXT: global_load_dword v1, v0, s[0:1] ; GFX940-NEXT: global_load_dword v2, v0, s[6:7] +; GFX940-NEXT: s_mov_b32 s0, 0xffff ; GFX940-NEXT: s_waitcnt vmcnt(1) ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX940-NEXT: v_lshlrev_b32_e64 v1, v1, s0 @@ -494,7 +493,6 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) ; GFX940-NEXT: v_bfi_b32 v1, v1, s0, v2 ; GFX940-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm -; %tid = 
call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -510,27 +508,27 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 { ; SI-LABEL: v_insertelement_v4bf16_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_load_dword s8, s[4:5], 0xc -; SI-NEXT: s_mov_b32 s7, 0x100f000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dword s4, s[6:7], 0xc +; SI-NEXT: s_mov_b32 s11, 0x100f000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b32 s4, 0xffff -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s5, 0xffff +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_bfi_b32 v2, s4, v4, v2 +; SI-NEXT: v_bfi_b32 v2, s5, v4, v2 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4bf16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -548,13 +546,13 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: v_insertelement_v4bf16_0: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 
0x0 -; GFX900-NEXT: s_load_dword s6, s[4:5], 0x30 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dword s4, s[6:7], 0x30 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX900-NEXT: s_mov_b32 s2, 0xffff -; GFX900-NEXT: v_mov_b32_e32 v3, s6 +; GFX900-NEXT: v_mov_b32_e32 v3, s4 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_bfi_b32 v0, s2, v3, v0 ; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -562,18 +560,18 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a ; ; GFX940-LABEL: v_insertelement_v4bf16_0: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX940-NEXT: s_load_dword s2, s[0:1], 0x30 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX940-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: s_mov_b32 s1, 0xffff ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v3, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, s0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_bfi_b32 v0, s0, v3, v0 +; GFX940-NEXT: v_bfi_b32 v0, s1, v3, v0 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm -; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -589,17 +587,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 { ; SI-LABEL: v_insertelement_v4bf16_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_load_dword s8, s[4:5], 0x4 -; SI-NEXT: s_mov_b32 s7, 0x100f000 -; SI-NEXT: s_mov_b32 s6, 0 +; 
SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dword s4, s[6:7], 0x4 +; SI-NEXT: s_mov_b32 s11, 0x100f000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_lshl_b32 s4, s8, 16 -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, s4, v2 @@ -608,8 +606,8 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v4bf16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -627,30 +625,30 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: v_insertelement_v4bf16_1: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX900-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_perm_b32 v0, s6, v0, v3 +; GFX900-NEXT: v_perm_b32 v0, s4, v0, v3 ; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX900-NEXT: s_endpgm ; ; GFX940-LABEL: v_insertelement_v4bf16_1: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 
-; GFX940-NEXT: s_load_dword s2, s[0:1], 0x10 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX940-NEXT: s_load_dword s0, s[2:3], 0x10 +; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_perm_b32 v0, s2, v0, v3 +; GFX940-NEXT: v_perm_b32 v0, s0, v0, v3 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm -; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -666,27 +664,27 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 { ; SI-LABEL: v_insertelement_v4bf16_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_load_dword s8, s[4:5], 0xc -; SI-NEXT: s_mov_b32 s7, 0x100f000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dword s4, s[6:7], 0xc +; SI-NEXT: s_mov_b32 s11, 0x100f000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b32 s4, 0xffff -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s5, 0xffff +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_bfi_b32 v3, s4, v4, v3 +; SI-NEXT: v_bfi_b32 v3, s5, v4, v3 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 
addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4bf16_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -704,13 +702,13 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: v_insertelement_v4bf16_2: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX900-NEXT: s_load_dword s6, s[4:5], 0x30 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dword s4, s[6:7], 0x30 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX900-NEXT: s_mov_b32 s2, 0xffff -; GFX900-NEXT: v_mov_b32_e32 v3, s6 +; GFX900-NEXT: v_mov_b32_e32 v3, s4 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_bfi_b32 v1, s2, v3, v1 ; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -718,18 +716,18 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a ; ; GFX940-LABEL: v_insertelement_v4bf16_2: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX940-NEXT: s_load_dword s2, s[0:1], 0x30 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX940-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: s_mov_b32 s1, 0xffff ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v3, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, s0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_bfi_b32 v1, s0, v3, v1 +; GFX940-NEXT: v_bfi_b32 v1, s1, v3, v1 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm -; %tid = 
call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -745,17 +743,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 { ; SI-LABEL: v_insertelement_v4bf16_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_load_dword s8, s[4:5], 0x4 -; SI-NEXT: s_mov_b32 s7, 0x100f000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dword s4, s[6:7], 0x4 +; SI-NEXT: s_mov_b32 s11, 0x100f000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_lshl_b32 s4, s8, 16 -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, s4, v3 @@ -764,8 +762,8 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v4bf16_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -783,30 +781,30 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: v_insertelement_v4bf16_3: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX900-NEXT: s_load_dword s6, s[4:5], 0x10 +; 
GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_perm_b32 v1, s6, v1, v3 +; GFX900-NEXT: v_perm_b32 v1, s4, v1, v3 ; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX900-NEXT: s_endpgm ; ; GFX940-LABEL: v_insertelement_v4bf16_3: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX940-NEXT: s_load_dword s2, s[0:1], 0x10 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX940-NEXT: s_load_dword s0, s[2:3], 0x10 +; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_perm_b32 v1, s2, v1, v3 +; GFX940-NEXT: v_perm_b32 v1, s0, v1, v3 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm -; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -822,23 +820,23 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idxval) #0 { ; SI-LABEL: v_insertelement_v4bf16_dynamic_sgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x4 -; SI-NEXT: s_mov_b32 s7, 0x100f000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; SI-NEXT: s_mov_b32 s11, 0x100f000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt 
lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_lshl_b32 s4, s8, 16 -; SI-NEXT: s_and_b32 s5, s8, 0xffff -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: s_lshl_b32 s6, s9, 4 -; SI-NEXT: s_or_b32 s7, s5, s4 -; SI-NEXT: s_lshl_b64 s[4:5], 0xffff, s6 -; SI-NEXT: v_mov_b32_e32 v4, s7 -; SI-NEXT: v_mov_b32_e32 v5, s7 +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_lshl_b32 s6, s4, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s5, 4 +; SI-NEXT: s_or_b32 s6, s4, s6 +; SI-NEXT: s_lshl_b64 s[4:5], 0xffff, s5 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfi_b32 v3, s5, v4, v3 ; SI-NEXT: v_bfi_b32 v2, s4, v5, v2 @@ -847,8 +845,8 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) ; ; VI-LABEL: v_insertelement_v4bf16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -873,13 +871,13 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) ; ; GFX900-LABEL: v_insertelement_v4bf16_dynamic_sgpr: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX900-NEXT: s_lshl_b32 s2, s7, 4 -; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s6 +; GFX900-NEXT: s_lshl_b32 s2, 
s5, 4 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 ; GFX900-NEXT: s_lshl_b64 s[2:3], 0xffff, s2 ; GFX900-NEXT: v_mov_b32_e32 v3, s4 ; GFX900-NEXT: v_mov_b32_e32 v4, s4 @@ -891,14 +889,15 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) ; ; GFX940-LABEL: v_insertelement_v4bf16_dynamic_sgpr: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x10 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 +; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] -; GFX940-NEXT: s_lshl_b32 s0, s3, 4 -; GFX940-NEXT: s_pack_ll_b32_b16 s2, s2, s2 -; GFX940-NEXT: s_lshl_b64 s[0:1], 0xffff, s0 +; GFX940-NEXT: s_lshl_b32 s1, s1, 4 +; GFX940-NEXT: s_pack_ll_b32_b16 s2, s0, s0 +; GFX940-NEXT: s_lshl_b64 s[0:1], 0xffff, s1 ; GFX940-NEXT: v_mov_b32_e32 v3, s2 ; GFX940-NEXT: v_mov_b32_e32 v4, s2 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -906,7 +905,6 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) ; GFX940-NEXT: v_bfi_b32 v0, s0, v4, v0 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm -; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -922,17 +920,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) { ; SI-LABEL: v_insertelement_v8bf16_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_load_dword s8, s[4:5], 0x4 -; SI-NEXT: s_mov_b32 s7, 0x100f000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dword s4, s[6:7], 0x4 +; SI-NEXT: 
s_mov_b32 s11, 0x100f000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v5, 0 -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 -; SI-NEXT: s_lshl_b32 s4, s8, 16 -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[8:11], 0 addr64 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 @@ -941,8 +939,8 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v8bf16_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -961,8 +959,8 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: v_insertelement_v8bf16_3: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX900-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -972,27 +970,27 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a ; GFX900-NEXT: v_bfi_b32 v3, s2, v3, v3 ; GFX900-NEXT: v_bfi_b32 v2, s2, v2, v2 ; GFX900-NEXT: v_bfi_b32 v0, s2, v0, v0 -; GFX900-NEXT: v_perm_b32 v1, s6, v1, v5 +; GFX900-NEXT: v_perm_b32 v1, s4, v1, v5 ; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX900-NEXT: s_endpgm ; ; GFX940-LABEL: v_insertelement_v8bf16_3: ; GFX940: ; %bb.0: -; GFX940-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX940-NEXT: s_load_dword s2, s[0:1], 0x10 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX940-NEXT: s_load_dword s0, s[2:3], 0x10 +; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_mov_b32_e32 v5, 0x5040100 +; GFX940-NEXT: s_mov_b32 s1, 0xffff ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; GFX940-NEXT: v_mov_b32_e32 v5, 0x5040100 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_bfi_b32 v3, s0, v3, v3 -; GFX940-NEXT: v_bfi_b32 v2, s0, v2, v2 -; GFX940-NEXT: v_bfi_b32 v0, s0, v0, v0 -; GFX940-NEXT: v_perm_b32 v1, s2, v1, v5 +; GFX940-NEXT: v_bfi_b32 v3, s1, v3, v3 +; GFX940-NEXT: v_bfi_b32 v2, s1, v2, v2 +; GFX940-NEXT: v_bfi_b32 v0, s1, v0, v0 +; GFX940-NEXT: v_perm_b32 v1, s0, v1, v5 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm -; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <8 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -1008,48 +1006,48 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) { ; SI-LABEL: v_insertelement_v8bf16_dynamic: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x4 -; SI-NEXT: s_mov_b32 s7, 0x100f000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; SI-NEXT: s_mov_b32 s11, 0x100f000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v5, 0 -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 -; SI-NEXT: 
s_cmp_eq_u32 s9, 6 -; SI-NEXT: v_mov_b32_e32 v6, s8 +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[8:11], 0 addr64 +; SI-NEXT: s_cmp_eq_u32 s5, 6 +; SI-NEXT: v_mov_b32_e32 v6, s4 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s9, 7 -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_cmp_eq_u32 s5, 7 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s9, 4 +; SI-NEXT: s_cmp_eq_u32 s5, 4 ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s9, 5 +; SI-NEXT: s_cmp_eq_u32 s5, 5 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s9, 2 +; SI-NEXT: s_cmp_eq_u32 s5, 2 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s9, 3 +; SI-NEXT: s_cmp_eq_u32 s5, 3 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v3, v7, v3 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s9, 0 +; SI-NEXT: s_cmp_eq_u32 s5, 0 ; SI-NEXT: v_or_b32_e32 v2, v2, v7 ; SI-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s9, 1 +; SI-NEXT: s_cmp_eq_u32 s5, 1 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1065,8 +1063,8 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ; ; VI-LABEL: v_insertelement_v8bf16_dynamic: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: 
s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1119,40 +1117,40 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ; ; GFX900-LABEL: v_insertelement_v8bf16_dynamic: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] -; GFX900-NEXT: s_cmp_eq_u32 s7, 6 -; GFX900-NEXT: v_mov_b32_e32 v5, s6 +; GFX900-NEXT: s_cmp_eq_u32 s5, 6 +; GFX900-NEXT: v_mov_b32_e32 v5, s4 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s7, 7 +; GFX900-NEXT: s_cmp_eq_u32 s5, 7 ; GFX900-NEXT: s_mov_b32 s2, 0x5040100 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc ; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s7, 4 +; GFX900-NEXT: s_cmp_eq_u32 s5, 4 ; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s7, 5 +; GFX900-NEXT: s_cmp_eq_u32 s5, 5 ; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s7, 2 +; GFX900-NEXT: s_cmp_eq_u32 s5, 2 ; GFX900-NEXT: v_perm_b32 v3, v3, v6, s2 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v5, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s7, 3 +; GFX900-NEXT: s_cmp_eq_u32 s5, 3 ; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s7, 0 +; GFX900-NEXT: s_cmp_eq_u32 s5, 0 ; 
GFX900-NEXT: v_perm_b32 v2, v6, v2, s2 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v8, v5, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s7, 1 +; GFX900-NEXT: s_cmp_eq_u32 s5, 1 ; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1164,49 +1162,49 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ; ; GFX940-LABEL: v_insertelement_v8bf16_dynamic: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x10 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 +; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] -; GFX940-NEXT: s_cmp_eq_u32 s3, 6 -; GFX940-NEXT: v_mov_b32_e32 v5, s2 +; GFX940-NEXT: s_cmp_eq_u32 s1, 6 +; GFX940-NEXT: v_mov_b32_e32 v5, s0 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s3, 7 +; GFX940-NEXT: s_cmp_eq_u32 s1, 7 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc ; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s3, 4 +; GFX940-NEXT: s_cmp_eq_u32 s1, 4 ; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s3, 5 +; GFX940-NEXT: s_cmp_eq_u32 s1, 5 ; GFX940-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX940-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s3, 2 -; GFX940-NEXT: v_perm_b32 v3, v3, v6, s0 +; GFX940-NEXT: s_cmp_eq_u32 s1, 2 +; GFX940-NEXT: v_perm_b32 v3, v3, v6, s2 ; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v5, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; 
GFX940-NEXT: s_cmp_eq_u32 s3, 3 +; GFX940-NEXT: s_cmp_eq_u32 s1, 3 ; GFX940-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s3, 0 -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s0 +; GFX940-NEXT: s_cmp_eq_u32 s1, 0 +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s2 ; GFX940-NEXT: v_cndmask_b32_e32 v6, v8, v5, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s3, 1 +; GFX940-NEXT: s_cmp_eq_u32 s1, 1 ; GFX940-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GFX940-NEXT: v_perm_b32 v1, v6, v1, s0 -; GFX940-NEXT: v_perm_b32 v0, v5, v0, s0 +; GFX940-NEXT: v_perm_b32 v1, v6, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm -; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <8 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -1222,18 +1220,18 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) { ; SI-LABEL: v_insertelement_v16bf16_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_load_dword s8, s[4:5], 0x4 -; SI-NEXT: s_mov_b32 s7, 0x100f000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dword s4, s[6:7], 0x4 +; SI-NEXT: s_mov_b32 s11, 0x100f000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v9, 0 -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 -; SI-NEXT: buffer_load_dwordx4 v[4:7], v[8:9], s[4:7], 0 addr64 
offset:16 -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: s_lshl_b32 s4, s8, 16 +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[8:9], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dwordx4 v[4:7], v[8:9], s[8:11], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 @@ -1244,8 +1242,8 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr ; ; VI-LABEL: v_insertelement_v16bf16_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1271,15 +1269,15 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr ; ; GFX900-LABEL: v_insertelement_v16bf16_3: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX900-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; GFX900-NEXT: v_mov_b32_e32 v9, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] ; GFX900-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 ; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_perm_b32 v1, s6, v1, v9 +; GFX900-NEXT: v_perm_b32 v1, s4, v1, v9 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1287,20 +1285,20 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr ; ; GFX940-LABEL: v_insertelement_v16bf16_3: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX940-NEXT: s_load_dword s2, s[0:1], 0x10 +; GFX940-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX940-NEXT: s_load_dword s0, s[2:3], 0x10 +; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; GFX940-NEXT: v_mov_b32_e32 v9, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] ; GFX940-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] offset:16 ; GFX940-NEXT: s_waitcnt vmcnt(1) -; GFX940-NEXT: v_perm_b32 v1, s2, v1, v9 +; GFX940-NEXT: v_perm_b32 v1, s0, v1, v9 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[4:5] offset:16 sc0 sc1 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm -; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <16 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -1316,22 +1314,21 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) { ; SI-LABEL: v_insertelement_v16bf16_dynamic: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; SI-NEXT: s_mov_b32 s11, 0x100f000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 5, v0 -; SI-NEXT: v_mov_b32_e32 v5, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: v_mov_b32_e32 v5, 0 ; SI-NEXT: buffer_load_dwordx4 v[7:10], v[4:5], s[8:11], 0 addr64 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[8:11], 0 addr64 offset:16 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s5, 6 ; SI-NEXT: v_mov_b32_e32 v6, s4 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s5, 7 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cndmask_b32_e32 v11, 
v10, v6, vcc ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 @@ -1417,8 +1414,8 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; ; VI-LABEL: v_insertelement_v16bf16_dynamic: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -1514,74 +1511,74 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; ; GFX900-LABEL: v_insertelement_v16bf16_dynamic: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] ; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[2:3] offset:16 -; GFX900-NEXT: s_cmp_eq_u32 s7, 6 -; GFX900-NEXT: v_mov_b32_e32 v9, s6 +; GFX900-NEXT: s_cmp_eq_u32 s5, 6 +; GFX900-NEXT: v_mov_b32_e32 v9, s4 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s7, 7 +; GFX900-NEXT: s_cmp_eq_u32 s5, 7 ; GFX900-NEXT: s_mov_b32 s2, 0x5040100 ; GFX900-NEXT: s_waitcnt vmcnt(1) ; GFX900-NEXT: v_cndmask_b32_e32 v10, v4, v9, vcc ; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s7, 4 +; GFX900-NEXT: s_cmp_eq_u32 s5, 4 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s7, 5 +; GFX900-NEXT: s_cmp_eq_u32 s5, 5 ; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s7, 2 +; GFX900-NEXT: s_cmp_eq_u32 s5, 2 ; GFX900-NEXT: 
v_perm_b32 v4, v4, v10, s2 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s7, 3 +; GFX900-NEXT: s_cmp_eq_u32 s5, 3 ; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v2 ; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s7, 0 +; GFX900-NEXT: s_cmp_eq_u32 s5, 0 ; GFX900-NEXT: v_perm_b32 v3, v10, v3, s2 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v12, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s7, 1 +; GFX900-NEXT: s_cmp_eq_u32 s5, 1 ; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s7, 14 +; GFX900-NEXT: s_cmp_eq_u32 s5, 14 ; GFX900-NEXT: v_perm_b32 v2, v10, v2, s2 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v13, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s7, 15 +; GFX900-NEXT: s_cmp_eq_u32 s5, 15 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_lshrrev_b32_e32 v14, 16, v8 ; GFX900-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s7, 12 +; GFX900-NEXT: s_cmp_eq_u32 s5, 12 ; GFX900-NEXT: v_perm_b32 v1, v10, v1, s2 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v14, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s7, 13 +; GFX900-NEXT: s_cmp_eq_u32 s5, 13 ; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; GFX900-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s7, 10 +; GFX900-NEXT: s_cmp_eq_u32 s5, 10 ; GFX900-NEXT: v_perm_b32 v8, v10, v8, s2 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v15, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s7, 11 +; GFX900-NEXT: s_cmp_eq_u32 s5, 11 ; GFX900-NEXT: v_lshrrev_b32_e32 v16, 16, v6 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; 
GFX900-NEXT: s_cmp_eq_u32 s7, 8 +; GFX900-NEXT: s_cmp_eq_u32 s5, 8 ; GFX900-NEXT: v_perm_b32 v7, v10, v7, s2 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v16, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s7, 9 +; GFX900-NEXT: s_cmp_eq_u32 s5, 9 ; GFX900-NEXT: v_lshrrev_b32_e32 v17, 16, v5 ; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1594,84 +1591,84 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; ; GFX940-LABEL: v_insertelement_v16bf16_dynamic: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x10 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 +; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] ; GFX940-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] offset:16 -; GFX940-NEXT: s_cmp_eq_u32 s3, 6 -; GFX940-NEXT: v_mov_b32_e32 v9, s2 +; GFX940-NEXT: s_cmp_eq_u32 s1, 6 +; GFX940-NEXT: v_mov_b32_e32 v9, s0 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s3, 7 +; GFX940-NEXT: s_cmp_eq_u32 s1, 7 ; GFX940-NEXT: s_waitcnt vmcnt(1) ; GFX940-NEXT: v_cndmask_b32_e32 v10, v3, v9, vcc ; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s3, 4 +; GFX940-NEXT: s_cmp_eq_u32 s1, 4 ; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s3, 5 +; GFX940-NEXT: s_cmp_eq_u32 s1, 5 ; GFX940-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX940-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s3, 2 -; GFX940-NEXT: v_perm_b32 v3, v3, v10, s0 +; GFX940-NEXT: s_cmp_eq_u32 
s1, 2 +; GFX940-NEXT: v_perm_b32 v3, v3, v10, s2 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s3, 3 +; GFX940-NEXT: s_cmp_eq_u32 s1, 3 ; GFX940-NEXT: v_lshrrev_b32_e32 v12, 16, v1 ; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s3, 0 -; GFX940-NEXT: v_perm_b32 v2, v10, v2, s0 +; GFX940-NEXT: s_cmp_eq_u32 s1, 0 +; GFX940-NEXT: v_perm_b32 v2, v10, v2, s2 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v12, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s3, 1 +; GFX940-NEXT: s_cmp_eq_u32 s1, 1 ; GFX940-NEXT: v_lshrrev_b32_e32 v13, 16, v0 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s3, 14 -; GFX940-NEXT: v_perm_b32 v1, v10, v1, s0 +; GFX940-NEXT: s_cmp_eq_u32 s1, 14 +; GFX940-NEXT: v_perm_b32 v1, v10, v1, s2 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v13, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s3, 15 +; GFX940-NEXT: s_cmp_eq_u32 s1, 15 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v14, 16, v7 ; GFX940-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s3, 12 -; GFX940-NEXT: v_perm_b32 v0, v10, v0, s0 +; GFX940-NEXT: s_cmp_eq_u32 s1, 12 +; GFX940-NEXT: v_perm_b32 v0, v10, v0, s2 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v14, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s3, 13 +; GFX940-NEXT: s_cmp_eq_u32 s1, 13 ; GFX940-NEXT: v_lshrrev_b32_e32 v15, 16, v6 ; GFX940-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s3, 10 -; GFX940-NEXT: v_perm_b32 v7, v10, v7, s0 +; GFX940-NEXT: s_cmp_eq_u32 s1, 10 +; GFX940-NEXT: v_perm_b32 v7, v10, v7, s2 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v15, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; 
GFX940-NEXT: s_cmp_eq_u32 s3, 11 +; GFX940-NEXT: s_cmp_eq_u32 s1, 11 ; GFX940-NEXT: v_lshrrev_b32_e32 v16, 16, v5 ; GFX940-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s3, 8 -; GFX940-NEXT: v_perm_b32 v6, v10, v6, s0 +; GFX940-NEXT: s_cmp_eq_u32 s1, 8 +; GFX940-NEXT: v_perm_b32 v6, v10, v6, s2 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v16, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s3, 9 +; GFX940-NEXT: s_cmp_eq_u32 s1, 9 ; GFX940-NEXT: v_lshrrev_b32_e32 v17, 16, v4 ; GFX940-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX940-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc -; GFX940-NEXT: v_perm_b32 v5, v10, v5, s0 -; GFX940-NEXT: v_perm_b32 v4, v9, v4, s0 +; GFX940-NEXT: v_perm_b32 v5, v10, v5, s2 +; GFX940-NEXT: v_perm_b32 v4, v9, v4, s2 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[4:5] offset:16 sc0 sc1 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm -; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <16 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 1ba2491d2210e..647870f0e0897 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; GFX9-LABEL: s_insertelement_v2i16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -19,7 +19,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; ; CIVI-LABEL: 
s_insertelement_v2i16_0: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -33,7 +33,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: s_insertelement_v2i16_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -54,21 +54,21 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i16 %elt) #0 { ; GFX9-LABEL: s_insertelement_v2i16_0_reg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_lh_b32_b16 s2, s6, s2 +; GFX9-NEXT: s_pack_lh_b32_b16 s2, s4, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_insertelement_v2i16_0_reg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -83,8 +83,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt ; ; CI-LABEL: s_insertelement_v2i16_0_reg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, 
s[4:5], 0xc +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -100,8 +100,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt ; GFX11-LABEL: s_insertelement_v2i16_0_reg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x30 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -121,14 +121,14 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i16 %elt) #0 { ; GFX9-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s6, s2 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: ;;#ASMSTART @@ -138,8 +138,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac ; ; VI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: 
v_mov_b32_e32 v0, s0 @@ -158,8 +158,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac ; ; CI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, s[4:5], 0xc +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -179,8 +179,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac ; GFX11-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x30 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -207,21 +207,21 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i32 %elt.arg) #0 { ; GFX9-LABEL: s_insertelement_v2i16_0_reghi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_hh_b32_b16 s2, s6, s2 +; GFX9-NEXT: s_pack_hh_b32_b16 s2, s4, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_insertelement_v2i16_0_reghi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: 
s_load_dword s4, s[6:7], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -235,8 +235,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; ; CI-LABEL: s_insertelement_v2i16_0_reghi: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, s[4:5], 0xc +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -251,8 +251,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; GFX11-LABEL: s_insertelement_v2i16_0_reghi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x30 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -274,12 +274,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %elt.arg) #0 { ; GFX9-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-NEXT: s_lshr_b32 s3, s6, 16 +; GFX9-NEXT: s_lshr_b32 s3, s4, 16 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_pack_lh_b32_b16 s2, s3, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -291,8 +291,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; ; VI-LABEL: 
s_insertelement_v2i16_0_reghi_multi_use_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -310,8 +310,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; ; CI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -330,8 +330,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; GFX11-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 ; GFX11-NEXT: s_lshr_b32 s0, s0, 16 @@ -359,12 +359,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %elt.arg) #0 { ; GFX9-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-NEXT: s_lshr_b32 s3, s6, 16 +; GFX9-NEXT: s_lshr_b32 s3, s4, 16 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s2, s2, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s3, s2 @@ -380,8 +380,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; ; VI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -402,8 +402,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; ; CI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -425,8 +425,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; GFX11-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 ; GFX11-NEXT: s_lshr_b32 s0, s0, 16 @@ -462,7 +462,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; GFX9-LABEL: s_insertelement_v2i16_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -474,7 
+474,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; ; CIVI-LABEL: s_insertelement_v2i16_1: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -488,7 +488,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: s_insertelement_v2i16_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -508,21 +508,21 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i16 %elt) #0 { ; GFX9-LABEL: s_insertelement_v2i16_1_reg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_insertelement_v2i16_1_reg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -537,8 +537,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt ; ; CI-LABEL: 
s_insertelement_v2i16_1_reg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, s[4:5], 0xc +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -554,8 +554,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt ; GFX11-LABEL: s_insertelement_v2i16_1_reg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x30 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -575,7 +575,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; GFX9-LABEL: s_insertelement_v2f16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -588,7 +588,7 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; ; CIVI-LABEL: s_insertelement_v2f16_0: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -602,7 +602,7 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: s_insertelement_v2f16_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) @@ -623,7 +623,7 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; GFX9-LABEL: s_insertelement_v2f16_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -635,7 +635,7 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; ; CIVI-LABEL: s_insertelement_v2f16_1: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -649,7 +649,7 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: s_insertelement_v2f16_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -669,7 +669,7 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2i16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -682,7 +682,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v2i16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -700,7 +700,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v2i16_0: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -718,7 +718,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v2i16_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -742,21 +744,21 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %elt.arg) #0 { ; GFX9-LABEL: v_insertelement_v2i16_0_reghi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7060302 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v1, v1, s6, v2 +; GFX9-NEXT: v_perm_b32 v1, v1, s4, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_0_reghi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: 
v_mov_b32_e32 v1, s3 @@ -774,8 +776,8 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; ; CI-LABEL: v_insertelement_v2i16_0_reghi: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -793,9 +795,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_insertelement_v2i16_0_reghi: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -819,7 +824,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2i16_0_inlineimm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -831,7 +836,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o ; ; VI-LABEL: v_insertelement_v2i16_0_inlineimm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -849,7 +854,7 @@ define amdgpu_kernel void 
@v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o ; ; CI-LABEL: v_insertelement_v2i16_0_inlineimm: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -867,7 +872,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o ; ; GFX11-LABEL: v_insertelement_v2i16_0_inlineimm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -891,7 +898,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2i16_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -904,7 +911,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v2i16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -922,7 +929,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v2i16_1: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -940,7 +947,9 @@ define 
amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v2i16_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -964,7 +973,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2i16_1_inlineimm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -976,7 +985,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o ; ; VI-LABEL: v_insertelement_v2i16_1_inlineimm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -994,7 +1003,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o ; ; CI-LABEL: v_insertelement_v2i16_1_inlineimm: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1012,7 +1021,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o ; ; GFX11-LABEL: v_insertelement_v2i16_1_inlineimm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1035,7 +1046,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2f16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x4500 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1048,7 +1059,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v2f16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1066,7 +1077,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v2f16_0: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1084,7 +1095,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v2f16_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1108,7 +1121,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) 
#0 { ; GFX9-LABEL: v_insertelement_v2f16_0_inlineimm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1120,7 +1133,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o ; ; VI-LABEL: v_insertelement_v2f16_0_inlineimm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1138,7 +1151,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o ; ; CI-LABEL: v_insertelement_v2f16_0_inlineimm: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1156,7 +1169,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o ; ; GFX11-LABEL: v_insertelement_v2f16_0_inlineimm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1179,7 +1194,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2f16_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1192,7 +1207,7 @@ define 
amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v2f16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1210,7 +1225,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v2f16_1: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1228,7 +1243,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v2f16_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1252,7 +1269,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2f16_1_inlineimm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1264,7 +1281,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o ; ; VI-LABEL: v_insertelement_v2f16_1_inlineimm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 
v1, s3 @@ -1282,7 +1299,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o ; ; CI-LABEL: v_insertelement_v2f16_1_inlineimm: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1300,7 +1317,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o ; ; GFX11-LABEL: v_insertelement_v2f16_1_inlineimm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1324,16 +1343,16 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, ptr addrspace(4) %idx.ptr) #0 { ; GFX9-LABEL: s_insertelement_v2i16_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s2, s4, 4 +; GFX9-NEXT: s_lshl_b32 s2, s6, 4 ; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2 -; GFX9-NEXT: s_andn2_b32 s3, s5, s2 +; GFX9-NEXT: s_andn2_b32 s3, s7, s2 ; GFX9-NEXT: s_and_b32 s2, s2, 0x3e703e7 ; GFX9-NEXT: s_or_b32 s2, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -1342,10 +1361,10 @@ define amdgpu_kernel void 
@s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; ; VI-LABEL: s_insertelement_v2i16_dynamic: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s4, s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x0 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1361,10 +1380,10 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; ; CI-LABEL: s_insertelement_v2i16_dynamic: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4 -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s4, s[6:7], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x0 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -1381,8 +1400,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; GFX11-LABEL: s_insertelement_v2i16_dynamic: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -1409,13 +1428,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) #0 { ; GFX9-LABEL: v_insertelement_v2i16_dynamic_sgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: 
s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e703e7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: s_lshl_b32 s2, s6, 4 +; GFX9-NEXT: s_lshl_b32 s2, s4, 4 ; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, s2, v2, v1 @@ -1424,8 +1443,8 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v2i16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1445,8 +1464,8 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) % ; ; CI-LABEL: v_insertelement_v2i16_dynamic_sgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1466,13 +1485,15 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) % ; ; GFX11-LABEL: v_insertelement_v2i16_dynamic_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_lshl_b32 s0, s0, 4 -; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_bfi_b32 v1, s0, 0x3e703e7, v1 @@ -1493,11 +1514,11 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) % define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %idx.ptr) #0 { ; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -1511,8 +1532,8 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -1538,8 +1559,8 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) % ; ; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s3 @@ -1565,8 +1586,10 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) % ; GFX11-LABEL: 
v_insertelement_v2f16_dynamic_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1597,13 +1620,13 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) % define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 { ; GFX9-LABEL: v_insertelement_v4f16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x30 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v0, s2, v3, v0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1611,8 +1634,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v4f16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1630,8 +1653,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v4f16_0: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, s[4:5], 0xc +; CI-NEXT: 
s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0xc ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1650,9 +1673,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v4f16_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x30 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1676,21 +1702,21 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 { ; GFX9-LABEL: v_insertelement_v4f16_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, s6, v0, v3 +; GFX9-NEXT: v_perm_b32 v0, s4, v0, v3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4f16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1708,8 +1734,8 @@ define amdgpu_kernel 
void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v4f16_1: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1728,9 +1754,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v4f16_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1754,13 +1783,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 { ; GFX9-LABEL: v_insertelement_v4f16_2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x30 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, s2, v3, v1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1768,8 +1797,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v4f16_2: ; VI: ; %bb.0: -; 
VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1787,8 +1816,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v4f16_2: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, s[4:5], 0xc +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0xc ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1807,9 +1836,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v4f16_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x30 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1833,21 +1865,21 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 { ; GFX9-LABEL: v_insertelement_v4f16_3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) 
-; GFX9-NEXT: v_perm_b32 v1, s6, v1, v3 +; GFX9-NEXT: v_perm_b32 v1, s4, v1, v3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4f16_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1865,8 +1897,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v4f16_3: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1885,9 +1917,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v4f16_3: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1911,13 +1946,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 { ; GFX9-LABEL: v_insertelement_v4i16_2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, 
s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, s2, v3, v1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1925,8 +1960,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v4i16_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1944,8 +1979,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v4i16_2: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1964,9 +1999,12 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v4i16_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1991,11 +2029,11 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void 
@v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 { ; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX9-NEXT: global_load_dword v2, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] ; GFX9-NEXT: s_mov_b64 s[2:3], 0xffff @@ -2010,11 +2048,11 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: flat_load_dword v4, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -2037,11 +2075,11 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) % ; ; CI-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: flat_load_dword v4, v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -2064,20 +2102,22 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) % ; ; GFX11-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: 
s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 ; GFX11-NEXT: global_load_b32 v2, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 0xffff ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_bfi_b32 v1, v3, s0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_bfi_b32 v0, v2, s0, v0 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 @@ -2099,13 +2139,13 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) % define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idxval) #0 { ; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX9-NEXT: s_lshl_b32 s2, s7, 4 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s6 +; GFX9-NEXT: s_lshl_b32 s2, s5, 4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 ; GFX9-NEXT: s_lshl_b64 s[2:3], 0xffff, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: v_mov_b32_e32 v4, s4 @@ -2117,8 +2157,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % 
; ; VI-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2143,8 +2183,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % ; ; CI-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2169,9 +2209,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % ; ; GFX11-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX11-NEXT: s_lshl_b32 s1, s1, 4 @@ -2199,21 +2242,21 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) { ; GFX9-LABEL: v_insertelement_v8f16_3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x5040100 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v1, s6, v1, v5 +; GFX9-NEXT: v_perm_b32 v1, s4, v1, v5 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v8f16_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2232,8 +2275,8 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v8f16_3: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2252,9 +2295,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v8f16_3: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2278,13 +2324,13 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) { ; GFX9-LABEL: v_insertelement_v8i16_6: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: 
s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v3, s2, v5, v3 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] @@ -2292,8 +2338,8 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v8i16_6: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2312,8 +2358,8 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v8i16_6: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2332,9 +2378,12 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v8i16_6: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2358,40 +2407,40 @@ 
define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) { ; GFX9-LABEL: v_insertelement_v8f16_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] -; GFX9-NEXT: s_cmp_eq_u32 s7, 6 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: s_cmp_eq_u32 s5, 6 +; GFX9-NEXT: v_mov_b32_e32 v5, s4 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 7 +; GFX9-NEXT: s_cmp_eq_u32 s5, 7 ; GFX9-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 4 +; GFX9-NEXT: s_cmp_eq_u32 s5, 4 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 5 +; GFX9-NEXT: s_cmp_eq_u32 s5, 5 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 2 +; GFX9-NEXT: s_cmp_eq_u32 s5, 2 ; GFX9-NEXT: v_perm_b32 v3, v3, v6, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 3 +; GFX9-NEXT: s_cmp_eq_u32 s5, 3 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 0 +; GFX9-NEXT: s_cmp_eq_u32 s5, 0 ; GFX9-NEXT: v_perm_b32 v2, v6, v2, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 1 +; GFX9-NEXT: 
s_cmp_eq_u32 s5, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 @@ -2403,8 +2452,8 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; ; VI-LABEL: v_insertelement_v8f16_dynamic: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2457,8 +2506,8 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; ; CI-LABEL: v_insertelement_v8f16_dynamic: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2527,9 +2576,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_insertelement_v8f16_dynamic: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] ; GFX11-NEXT: s_cmp_eq_u32 s1, 6 @@ -2585,15 +2637,15 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) { ; GFX9-LABEL: v_insertelement_v16f16_3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_perm_b32 v1, s6, v1, v9 +; GFX9-NEXT: v_perm_b32 v1, s4, v1, v9 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -2601,8 +2653,8 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v16f16_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2628,8 +2680,8 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a ; ; CI-LABEL: v_insertelement_v16f16_3: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s3 @@ -2655,9 +2707,12 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: v_insertelement_v16f16_3: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; GFX11-NEXT: 
s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7] @@ -2686,14 +2741,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) { ; GFX9-LABEL: v_insertelement_v16i16_6: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-NEXT: v_mov_b32_e32 v9, s4 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_bfi_b32 v3, s2, v9, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2703,8 +2758,8 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v16i16_6: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: v_mov_b32_e32 v12, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2729,8 +2784,8 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a ; ; CI-LABEL: v_insertelement_v16i16_6: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2756,9 +2811,12 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: 
v_insertelement_v16i16_6: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7] @@ -2787,74 +2845,74 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) { ; GFX9-LABEL: v_insertelement_v16f16_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] ; GFX9-NEXT: global_load_dwordx4 v[5:8], v0, s[2:3] offset:16 -; GFX9-NEXT: s_cmp_eq_u32 s7, 6 -; GFX9-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-NEXT: s_cmp_eq_u32 s5, 6 +; GFX9-NEXT: v_mov_b32_e32 v9, s4 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 7 +; GFX9-NEXT: s_cmp_eq_u32 s5, 7 ; GFX9-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cndmask_b32_e32 v10, v4, v9, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 4 +; GFX9-NEXT: s_cmp_eq_u32 s5, 4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 5 +; GFX9-NEXT: s_cmp_eq_u32 s5, 5 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 2 +; 
GFX9-NEXT: s_cmp_eq_u32 s5, 2 ; GFX9-NEXT: v_perm_b32 v4, v4, v10, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 3 +; GFX9-NEXT: s_cmp_eq_u32 s5, 3 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 0 +; GFX9-NEXT: s_cmp_eq_u32 s5, 0 ; GFX9-NEXT: v_perm_b32 v3, v10, v3, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 1 +; GFX9-NEXT: s_cmp_eq_u32 s5, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 14 +; GFX9-NEXT: s_cmp_eq_u32 s5, 14 ; GFX9-NEXT: v_perm_b32 v2, v10, v2, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v13, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 15 +; GFX9-NEXT: s_cmp_eq_u32 s5, 15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 12 +; GFX9-NEXT: s_cmp_eq_u32 s5, 12 ; GFX9-NEXT: v_perm_b32 v1, v10, v1, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 13 +; GFX9-NEXT: s_cmp_eq_u32 s5, 13 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 10 +; GFX9-NEXT: s_cmp_eq_u32 s5, 10 ; GFX9-NEXT: v_perm_b32 v8, v10, v8, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v15, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 11 +; GFX9-NEXT: s_cmp_eq_u32 s5, 11 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 8 +; GFX9-NEXT: 
s_cmp_eq_u32 s5, 8 ; GFX9-NEXT: v_perm_b32 v7, v10, v7, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v16, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 9 +; GFX9-NEXT: s_cmp_eq_u32 s5, 9 ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 @@ -2867,8 +2925,8 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; ; VI-LABEL: v_insertelement_v16f16_dynamic: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -2964,8 +3022,8 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; ; CI-LABEL: v_insertelement_v16f16_dynamic: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 5, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -3094,9 +3152,12 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_insertelement_v16f16_dynamic: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll 
b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index df03e89370377..aca4730122f90 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -436,7 +436,7 @@ entry: define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX9-LABEL: udiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 @@ -466,7 +466,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX90A-LABEL: udiv_i32: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 @@ -496,7 +496,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX10-LABEL: udiv_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX10-NEXT: s_sub_i32 s5, 0, s3 @@ -526,7 +526,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-FLATSCR-LABEL: udiv_i32: ; GFX9-FLATSCR: ; %bb.0: -; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-FLATSCR-NEXT: v_cvt_f32_u32_e32 v0, s3 @@ -556,7 +556,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX11-LABEL: udiv_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: 
v_cvt_f32_u32_e32 v0, s3 ; GFX11-NEXT: s_sub_i32 s5, 0, s3 @@ -593,7 +593,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX12-LABEL: udiv_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cvt_f32_u32 s4, s3 ; GFX12-NEXT: s_sub_co_i32 s5, 0, s3 @@ -692,19 +692,19 @@ main_body: define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX9-LABEL: atomic_add_local: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[2:3] -; GFX9-NEXT: s_mul_i32 s1, s1, 5 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_mul_i32 s0, s0, 5 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: ds_add_u32 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB5_2: @@ -712,19 +712,19 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; ; GFX90A-LABEL: atomic_add_local: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_mov_b64 s[2:3], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: s_mov_b64 s[0:1], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB5_2 ; GFX90A-NEXT: ; 
%bb.1: -; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] -; GFX90A-NEXT: s_mul_i32 s1, s1, 5 -; GFX90A-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX90A-NEXT: s_mul_i32 s0, s0, 5 +; GFX90A-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s2 ; GFX90A-NEXT: ds_add_u32 v0, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB5_2: @@ -732,18 +732,18 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; ; GFX10-LABEL: atomic_add_local: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, exec_lo -; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX10-NEXT: s_mov_b32 s0, exec_lo +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB5_2 ; GFX10-NEXT: ; %bb.1: -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s1, s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bcnt1_i32_b32 s1, s2 -; GFX10-NEXT: s_mul_i32 s1, s1, 5 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX10-NEXT: s_mul_i32 s0, s0, 5 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s1 ; GFX10-NEXT: ds_add_u32 v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv @@ -752,19 +752,19 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; ; GFX9-FLATSCR-LABEL: atomic_add_local: ; GFX9-FLATSCR: ; %bb.0: -; GFX9-FLATSCR-NEXT: s_mov_b64 s[2:3], exec -; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-FLATSCR-NEXT: s_mov_b64 s[0:1], exec +; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, 
s1, v0 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: -; GFX9-FLATSCR-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-FLATSCR-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s1, s[2:3] -; GFX9-FLATSCR-NEXT: s_mul_i32 s1, s1, 5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-FLATSCR-NEXT: s_mul_i32 s0, s0, 5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-FLATSCR-NEXT: ds_add_u32 v0, v1 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: .LBB5_2: @@ -772,19 +772,19 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; ; GFX11-LABEL: atomic_add_local: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_mov_b32 s3, exec_lo -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB5_2 ; GFX11-NEXT: ; %bb.1: -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bcnt1_i32_b32 s1, s2 +; GFX11-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s1, s1, 5 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: s_mul_i32 s0, s0, 5 +; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v0, s1 ; GFX11-NEXT: ds_add_u32 v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -793,19 +793,19 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; ; 
GFX12-LABEL: atomic_add_local: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_mov_b32 s2, exec_lo -; GFX12-NEXT: s_mov_b32 s3, exec_lo -; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12-NEXT: s_cbranch_execz .LBB5_2 ; GFX12-NEXT: ; %bb.1: -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s1, s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_bcnt1_i32_b32 s1, s2 +; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mul_i32 s1, s1, 5 -; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 +; GFX12-NEXT: s_mul_i32 s0, s0, 5 +; GFX12-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v0, s1 ; GFX12-NEXT: ds_add_u32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -894,10 +894,10 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 @@ -906,8 +906,8 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB7_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -923,10 +923,10 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: ; implicit-def: $vgpr1 -; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB7_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX90A-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX90A-NEXT: s_mul_i32 s4, s4, 5 @@ -935,8 +935,8 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB7_2: -; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s2, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -947,26 +947,26 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; ; GFX10-LABEL: atomic_add_ret_local: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s3, exec_lo +; GFX10-NEXT: s_mov_b32 s1, exec_lo ; GFX10-NEXT: ; implicit-def: $vgpr1 -; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB7_2 ; GFX10-NEXT: ; %bb.1: -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX10-NEXT: s_mul_i32 s3, s3, 5 -; GFX10-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-NEXT: s_bcnt1_i32_b32 s1, s1 +; 
GFX10-NEXT: s_mul_i32 s1, s1, 5 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: .LBB7_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -982,10 +982,10 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1 -; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: -; GFX9-FLATSCR-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-FLATSCR-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-FLATSCR-NEXT: s_mul_i32 s4, s4, 5 @@ -994,8 +994,8 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-FLATSCR-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: .LBB7_2: -; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0 @@ -1006,26 +1006,26 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: atomic_add_ret_local: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_mov_b32 s3, exec_lo 
-; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB7_2 ; GFX11-NEXT: ; %bb.1: -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX11-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s3, s3, 5 -; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: s_mul_i32 s1, s1, 5 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s4 ; GFX11-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: .LBB7_2: -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -1037,26 +1037,26 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: atomic_add_ret_local: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_mov_b32 s3, exec_lo -; GFX12-NEXT: s_mov_b32 s2, exec_lo -; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12-NEXT: s_cbranch_execz .LBB7_2 ; GFX12-NEXT: ; %bb.1: -; GFX12-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x2c ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: 
s_bcnt1_i32_b32 s3, s3 +; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mul_i32 s3, s3, 5 -; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: s_mul_i32 s1, s1, 5 +; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s4 ; GFX12-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: .LBB7_2: -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 @@ -1083,10 +1083,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 @@ -1094,8 +1094,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB8_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1111,10 +1111,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, 
s5, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: ; implicit-def: $vgpr1 -; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB8_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX90A-NEXT: s_mul_i32 s4, s4, 5 @@ -1122,8 +1122,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX90A-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB8_2: -; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s2, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -1134,24 +1134,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10-LABEL: add_i32_constant: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s3, exec_lo +; GFX10-NEXT: s_mov_b32 s1, exec_lo ; GFX10-NEXT: ; implicit-def: $vgpr1 -; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB8_2 ; GFX10-NEXT: ; %bb.1: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX10-NEXT: s_mul_i32 s3, s3, 5 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX10-NEXT: s_mul_i32 s1, s1, 5 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-NEXT: .LBB8_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1167,10 +1167,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1 -; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: -; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-FLATSCR-NEXT: s_mul_i32 s4, s4, 5 @@ -1178,8 +1178,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-FLATSCR-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: .LBB8_2: -; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0 @@ -1190,25 +1190,25 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: add_i32_constant: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_mov_b32 s3, exec_lo -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: 
v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX11-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s3, s3, 5 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_mul_i32 s1, s1, 5 +; GFX11-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB8_2: -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -1220,25 +1220,25 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12-LABEL: add_i32_constant: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_mov_b32 s3, exec_lo -; GFX12-NEXT: s_mov_b32 s2, exec_lo -; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12-NEXT: s_cbranch_execz .LBB8_2 ; GFX12-NEXT: ; %bb.1: -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mul_i32 s3, s3, 5 -; GFX12-NEXT: v_mov_b32_e32 
v1, s3 +; GFX12-NEXT: s_mul_i32 s1, s1, 5 +; GFX12-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: .LBB8_2: -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll index 6c8646968b676..b49931379b84a 100644 --- a/llvm/test/CodeGen/AMDGPU/ipra.ll +++ b/llvm/test/CodeGen/AMDGPU/ipra.ll @@ -59,10 +59,10 @@ define void @func_regular_call() #1 { ; GCN-LABEL: {{^}}func_tail_call: ; GCN: s_waitcnt -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, -; GCN-NEXT: s_addc_u32 s5, -; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, +; GCN-NEXT: s_addc_u32 s17, +; GCN-NEXT: s_setpc_b64 s[16:17] ; GCN: ; NumSgprs: 32 ; GCN: ; NumVgprs: 8 diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll index 2370ceff89bd5..496a1c652da25 100644 --- a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll +++ b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll @@ -7,11 +7,11 @@ declare void @llvm.trap() #0 ; DOORBELL-NEXT: .amdhsa_group_segment_fixed_size 0 ; DOORBELL-NEXT: .amdhsa_private_segment_fixed_size 0 ; DOORBELL-NEXT: .amdhsa_kernarg_size 8 -; DOORBELL-NEXT: .amdhsa_user_sgpr_count 6 +; DOORBELL-NEXT: .amdhsa_user_sgpr_count 12 ; DOORBELL-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 ; DOORBELL: .end_amdhsa_kernel -define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { +define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) #0 { store volatile i32 1, ptr addrspace(1) %arg0 call void @llvm.trap() unreachable @@ -19,5 +19,7 @@ define amdgpu_kernel void @trap(ptr 
addrspace(1) nocapture readonly %arg0) { ret void } +attributes #0 = { "amdgpu-no-implicitarg-ptr" } + !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 CODE_OBJECT_VERSION} diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index 69f181fcede30..f9073be7e260b 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -8,11 +8,11 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounwind { ; SI-LABEL: i8_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s2, 0xff +; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -20,10 +20,10 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw ; ; VI-LABEL: i8_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 0xff +; VI-NEXT: s_and_b32 s2, s4, 0xff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -32,8 +32,8 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw ; ; GFX9-LABEL: i8_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xff @@ -80,11 +80,11 @@ define amdgpu_kernel void @i8_arg(ptr 
addrspace(1) nocapture %out, i8 %in) nounw define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroext %in) nounwind { ; SI-LABEL: i8_zext_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s2, 0xff +; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -92,10 +92,10 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe ; ; VI-LABEL: i8_zext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 0xff +; VI-NEXT: s_and_b32 s2, s4, 0xff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -104,8 +104,8 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe ; ; GFX9-LABEL: i8_zext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xff @@ -155,11 +155,11 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signext %in) nounwind { ; SI-LABEL: i8_sext_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 
0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sext_i32_i8 s4, s2 +; SI-NEXT: s_sext_i32_i8 s4, s4 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -167,10 +167,10 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe ; ; VI-LABEL: i8_sext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sext_i32_i8 s2, s2 +; VI-NEXT: s_sext_i32_i8 s2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -179,8 +179,8 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe ; ; GFX9-LABEL: i8_sext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i8 s2, s2 @@ -230,11 +230,11 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nounwind { ; SI-LABEL: i16_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s2, 0xffff +; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -242,10 +242,10 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou ; ; VI-LABEL: i16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_and_b32 s2, s4, 0xffff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -254,8 +254,8 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou ; ; GFX9-LABEL: i16_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff @@ -302,11 +302,11 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zeroext %in) nounwind { ; SI-LABEL: i16_zext_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s2, 0xffff +; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -314,10 +314,10 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer ; ; VI-LABEL: i16_zext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_and_b32 s2, s4, 0xffff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -326,8 +326,8 @@ define 
amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer ; ; GFX9-LABEL: i16_zext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff @@ -377,11 +377,11 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 signext %in) nounwind { ; SI-LABEL: i16_sext_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sext_i32_i16 s4, s2 +; SI-NEXT: s_sext_i32_i16 s4, s4 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -389,10 +389,10 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig ; ; VI-LABEL: i16_sext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sext_i32_i16 s2, s2 +; VI-NEXT: s_sext_i32_i16 s2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -401,8 +401,8 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig ; ; GFX9-LABEL: i16_sext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: s_sext_i32_i16 s2, s2 @@ -452,8 +452,8 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nounwind { ; SI-LABEL: i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -463,19 +463,19 @@ define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nou ; ; VI-LABEL: i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -511,8 +511,8 @@ entry: define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) nounwind { ; SI-LABEL: f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -522,19 +522,19 @@ define amdgpu_kernel void @f32_arg(ptr addrspace(1) 
nocapture %out, float %in) n ; ; VI-LABEL: f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -570,8 +570,8 @@ entry: define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) { ; SI-LABEL: v2i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -581,19 +581,19 @@ define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) { ; ; VI-LABEL: v2i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v2i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, 
s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -659,8 +659,8 @@ entry: define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) { ; SI-LABEL: v2i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -670,19 +670,19 @@ define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) { ; ; VI-LABEL: v2i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v2i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -718,7 +718,7 @@ entry: define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> %in) nounwind { ; SI-LABEL: v2i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -731,7 
+731,7 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> ; ; VI-LABEL: v2i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -742,7 +742,7 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> ; ; GFX9-LABEL: v2i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -781,7 +781,7 @@ entry: define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float> %in) nounwind { ; SI-LABEL: v2f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -794,7 +794,7 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float ; ; VI-LABEL: v2f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -805,7 +805,7 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float ; ; GFX9-LABEL: v2f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -844,8 +844,8 @@ entry: define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %in) nounwind { ; SI-LABEL: v3i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; 
SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s5, s4, 16 @@ -858,26 +858,26 @@ define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %i ; ; VI-LABEL: v3i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s3, s2, 16 +; VI-NEXT: s_lshr_b32 s2, s4, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_add_u32 s0, s0, 2 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v5, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: flat_store_byte v[2:3], v5 ; VI-NEXT: flat_store_short v[0:1], v4 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v3i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -983,7 +983,7 @@ entry: define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind { ; SI-LABEL: v3i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -998,7 +998,7 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> ; ; VI-LABEL: v3i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s0, 4 ; VI-NEXT: s_addc_u32 s5, s1, 0 @@ -1014,7 +1014,7 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> ; ; GFX9-LABEL: v3i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -1102,8 +1102,8 @@ entry: define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind { ; SI-LABEL: v3i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1117,8 +1117,8 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> ; ; VI-LABEL: v3i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s1 @@ -1130,14 +1130,14 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> ; ; GFX9-LABEL: v3i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v3i32_arg: 
@@ -1181,8 +1181,8 @@ entry: define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind { ; SI-LABEL: v3f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1196,8 +1196,8 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float ; ; VI-LABEL: v3f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s1 @@ -1209,14 +1209,14 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float ; ; GFX9-LABEL: v3f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v3f32_arg: @@ -1260,8 +1260,8 @@ entry: define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) { ; SI-LABEL: v4i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ 
-1271,19 +1271,19 @@ define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) { ; ; VI-LABEL: v4i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v4i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -1319,7 +1319,7 @@ entry: define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; SI-LABEL: v4i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1332,7 +1332,7 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; ; VI-LABEL: v4i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -1343,7 +1343,7 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; ; GFX9-LABEL: v4i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1382,8 +1382,8 
@@ entry: define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> %in) nounwind { ; SI-LABEL: v4i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1396,8 +1396,8 @@ define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> ; ; VI-LABEL: v4i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1410,15 +1410,15 @@ define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> ; ; GFX9-LABEL: v4i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v4i32_arg: @@ -1456,8 +1456,8 @@ entry: define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float> %in) nounwind { ; SI-LABEL: v4f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 
s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1470,8 +1470,8 @@ define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float ; ; VI-LABEL: v4f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1484,15 +1484,15 @@ define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float ; ; GFX9-LABEL: v4f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v4f32_arg: @@ -1530,7 +1530,7 @@ entry: define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) nounwind { ; SI-LABEL: v5i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1545,7 +1545,7 @@ define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %i ; ; VI-LABEL: v5i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s0, 4 ; VI-NEXT: s_addc_u32 s5, s1, 0 @@ -1561,7 +1561,7 @@ define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %i ; ; 
GFX9-LABEL: v5i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -1671,50 +1671,50 @@ entry: define amdgpu_kernel void @v5i16_arg(ptr addrspace(1) nocapture %out, <5 x i16> %in) nounwind { ; SI-LABEL: v5i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s2, s[0:1], 0xf -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s6, s[2:3], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:8 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s5, s[0:1], 0x3c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s5, s[2:3], 0x3c +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s2, 8 +; VI-NEXT: s_add_u32 s4, s0, 8 ; VI-NEXT: v_mov_b32_e32 v4, s5 -; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: s_addc_u32 s5, s1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: 
flat_store_short v[2:3], v4 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v5i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_short v2, v3, s[6:7] offset:8 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_short v2, v3, s[4:5] offset:8 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v5i16_arg: @@ -1902,27 +1902,27 @@ entry: define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32> %in) nounwind { ; SI-LABEL: v5i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s8, s[0:1], 0x15 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s8, s[2:3], 0x15 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x11 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_mov_b32_e32 v3, s3 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: 
v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s7, s[0:1], 0x54 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s7, s[2:3], 0x54 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s6, s4, 16 ; VI-NEXT: v_mov_b32_e32 v2, s7 @@ -1941,9 +1941,9 @@ define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32> ; ; GFX9-LABEL: v5i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s8, s[4:5], 0x30 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s8, s[6:7], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, s8 @@ -1951,8 +1951,8 @@ define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32> ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dword v4, v5, s[6:7] offset:16 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX9-NEXT: global_store_dword v4, v5, s[4:5] offset:16 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v5i32_arg: @@ -2000,27 +2000,27 @@ entry: define amdgpu_kernel void @v5f32_arg(ptr addrspace(1) nocapture %out, <5 x float> %in) nounwind { ; SI-LABEL: v5f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s8, s[0:1], 0x15 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: 
s_load_dword s8, s[2:3], 0x15 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x11 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_mov_b32_e32 v3, s3 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s7, s[0:1], 0x54 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dword s7, s[2:3], 0x54 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s6, s4, 16 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -2039,19 +2039,19 @@ define amdgpu_kernel void @v5f32_arg(ptr addrspace(1) nocapture %out, <5 x float ; ; GFX9-LABEL: v5f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s6, s[6:7], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX9-NEXT: s_nop 0 -; 
GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: global_store_dword v4, v0, s[6:7] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: global_store_dword v4, v0, s[4:5] offset:16 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v5f32_arg: @@ -2099,34 +2099,34 @@ entry: define amdgpu_kernel void @v5i64_arg(ptr addrspace(1) nocapture %out, <5 x i64> %in) nounwind { ; SI-LABEL: v5i64_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x19 -; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x21 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x21 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 offset:32 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:32 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5i64_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x84 -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[10:11], s[2:3], 
0x84 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x64 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s12, s8, 32 ; VI-NEXT: v_mov_b32_e32 v1, s10 @@ -2155,9 +2155,9 @@ define amdgpu_kernel void @v5i64_arg(ptr addrspace(1) nocapture %out, <5 x i64> ; ; GFX9-LABEL: v5i64_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x60 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 @@ -2241,34 +2241,34 @@ entry: define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) nounwind { ; SI-LABEL: v5f64_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x19 -; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x21 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x21 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[12:15], 0 offset:32 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:32 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5f64_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x84 -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x84 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x64 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s12, s8, 32 ; VI-NEXT: v_mov_b32_e32 v1, s10 @@ -2297,9 +2297,9 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; ; GFX9-LABEL: v5f64_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x60 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 @@ -2384,7 +2384,7 @@ entry: define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; SI-LABEL: v8i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2397,7 +2397,7 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; ; VI-LABEL: v8i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -2408,7 +2408,7 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; ; GFX9-LABEL: 
v8i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2635,8 +2635,8 @@ entry: define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; SI-LABEL: v8i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2649,8 +2649,8 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; ; VI-LABEL: v8i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2663,15 +2663,15 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; ; GFX9-LABEL: v8i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v8i16_arg: @@ -2883,8 +2883,8 @@ entry: define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind { ; SI-LABEL: v8i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: 
s_load_dwordx8 s[4:11], s[0:1], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2903,8 +2903,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; ; VI-LABEL: v8i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_add_u32 s2, s0, 16 @@ -2926,8 +2926,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; ; GFX9-LABEL: v8i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s12 @@ -2994,8 +2994,8 @@ entry: define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float> %in) nounwind { ; SI-LABEL: v8f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3014,8 +3014,8 @@ define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float ; ; VI-LABEL: v8f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_add_u32 s2, s0, 16 @@ -3037,8 +3037,8 @@ define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float ; ; GFX9-LABEL: v8f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s12 @@ -3106,8 +3106,8 @@ entry: define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; SI-LABEL: v16i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3120,8 +3120,8 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; ; VI-LABEL: v16i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -3134,15 +3134,15 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; ; GFX9-LABEL: v16i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v4, 
v[0:3], s[6:7] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v16i8_arg: @@ -3556,8 +3556,8 @@ entry: define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; SI-LABEL: v16i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3576,8 +3576,8 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; ; VI-LABEL: v16i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_add_u32 s2, s0, 16 @@ -3599,8 +3599,8 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; ; GFX9-LABEL: v16i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s12 @@ -4012,8 +4012,8 @@ entry: define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32> %in) nounwind { ; SI-LABEL: v16i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4044,8 +4044,8 @@ define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) 
nocapture %out, <16 x i32 ; ; VI-LABEL: v16i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: s_add_u32 s2, s0, 48 @@ -4085,8 +4085,8 @@ define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32 ; ; GFX9-LABEL: v16i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s20 @@ -4200,8 +4200,8 @@ entry: define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x float> %in) nounwind { ; SI-LABEL: v16f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4232,8 +4232,8 @@ define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x flo ; ; VI-LABEL: v16f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: s_add_u32 s2, s0, 48 @@ -4273,8 +4273,8 @@ define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x flo ; ; GFX9-LABEL: v16f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX9-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s20 @@ -4388,7 +4388,7 @@ entry: define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwind { ; SI-LABEL: kernel_arg_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4401,7 +4401,7 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin ; ; VI-LABEL: kernel_arg_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -4412,7 +4412,7 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin ; ; GFX9-LABEL: kernel_arg_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4450,7 +4450,7 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { ; SI-LABEL: f64_kernel_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4463,7 +4463,7 @@ define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { ; ; VI-LABEL: f64_kernel_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 
@@ -4474,7 +4474,7 @@ define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { ; ; GFX9-LABEL: f64_kernel_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4522,8 +4522,8 @@ entry: define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nounwind { ; SI-LABEL: i65_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s8, s4, 1 @@ -4539,8 +4539,8 @@ define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nou ; ; VI-LABEL: i65_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s4, s4, 1 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -4558,11 +4558,11 @@ define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nou ; ; GFX9-LABEL: i65_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s6, 1 +; GFX9-NEXT: s_and_b32 s4, s4, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -4640,11 +4640,11 @@ entry: define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s2, 1 +; SI-NEXT: s_and_b32 s4, s4, 1 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -4652,10 +4652,10 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { ; ; VI-LABEL: i1_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 1 +; VI-NEXT: s_and_b32 s2, s4, 1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -4664,8 +4664,8 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { ; ; GFX9-LABEL: i1_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 @@ -4731,11 +4731,11 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg_zext_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s2, 1 +; SI-NEXT: s_and_b32 s4, s4, 1 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -4743,10 +4743,10 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr 
addrspace(1) %out, i1 %x) nounwin ; ; VI-LABEL: i1_arg_zext_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 1 +; VI-NEXT: s_and_b32 s2, s4, 1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -4755,8 +4755,8 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; GFX9-LABEL: i1_arg_zext_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 @@ -4803,8 +4803,8 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg_zext_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4816,11 +4816,11 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; ; VI-LABEL: i1_arg_zext_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 1 +; VI-NEXT: s_and_b32 s2, s4, 1 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -4829,8 +4829,8 @@ define amdgpu_kernel void 
@i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; ; GFX9-LABEL: i1_arg_zext_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 @@ -4879,11 +4879,11 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg_sext_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_i32 s4, s2, 0x10000 +; SI-NEXT: s_bfe_i32 s4, s4, 0x10000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -4891,10 +4891,10 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; VI-LABEL: i1_arg_sext_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_i32 s2, s2, 0x10000 +; VI-NEXT: s_bfe_i32 s2, s4, 0x10000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -4903,8 +4903,8 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; GFX9-LABEL: i1_arg_sext_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: 
s_bfe_i32 s2, s2, 0x10000 @@ -4953,11 +4953,11 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg_sext_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 +; SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -4966,21 +4966,21 @@ define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; ; VI-LABEL: i1_arg_sext_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: i1_arg_sext_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 @@ -5062,10 +5062,10 @@ define amdgpu_kernel void @empty_struct_arg({} %in) nounwind { define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) { ; 
SI-LABEL: struct_argument_alignment: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dword s9, s[0:1], 0xf -; SI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x11 +; SI-NEXT: s_load_dword s8, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb +; SI-NEXT: s_load_dword s9, s[2:3], 0xf +; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x11 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -5089,46 +5089,46 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, ; ; VI-LABEL: struct_argument_alignment: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s5, s[0:1], 0x3c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; VI-NEXT: s_load_dword s4, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dword s5, s[2:3], 0x3c +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x44 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: struct_argument_alignment: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX9-NEXT: s_load_dword s7, s[4:5], 0x18 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x20 +; 
GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dword s5, s[6:7], 0x18 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x20 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -5196,6 +5196,7 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { ; SI-LABEL: packed_struct_argument_alignment: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b64 s[0:1], s[2:3] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_load_dword s6, s[0:1], 0x9 @@ -5229,37 +5230,37 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, ; ; VI-LABEL: packed_struct_argument_alignment: ; VI: ; %bb.0: -; VI-NEXT: s_add_u32 s2, s0, 49 -; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: s_add_u32 s4, s0, 50 -; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: s_add_u32 s2, s2, 3 -; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: s_add_u32 s2, s0, 51 -; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: s_add_u32 s0, s2, 49 +; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_add_u32 s4, s2, 50 +; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_add_u32 s0, s0, 3 +; VI-NEXT: s_addc_u32 s1, 
s1, 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_add_u32 s0, s2, 51 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: v_mov_b32_e32 v7, s1 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: v_mov_b32_e32 v6, s0 ; VI-NEXT: flat_load_ubyte v8, v[0:1] ; VI-NEXT: flat_load_ubyte v9, v[2:3] ; VI-NEXT: flat_load_ubyte v10, v[4:5] ; VI-NEXT: flat_load_ubyte v6, v[6:7] -; VI-NEXT: s_add_u32 s2, s0, 53 -; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_add_u32 s0, s2, 53 +; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_load_dword s2, s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x28 +; VI-NEXT: s_load_dword s4, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x28 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, s2 +; VI-NEXT: v_mov_b32_e32 v7, s4 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dword v[2:3], v7 @@ -5280,10 +5281,10 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, ; GFX9-LABEL: packed_struct_argument_alignment: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_load_dword v6, v2, s[4:5] offset:13 -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:17 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4 +; GFX9-NEXT: global_load_dword v6, v2, s[6:7] offset:13 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] offset:17 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5379,11 +5380,11 @@ define 
amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) { ; SI-LABEL: struct_argument_alignment_after: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s12, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_load_dword s13, s[0:1], 0xf -; SI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x11 -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x15 +; SI-NEXT: s_load_dword s12, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xb +; SI-NEXT: s_load_dword s13, s[2:3], 0xf +; SI-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x11 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x15 ; SI-NEXT: s_mov_b32 s4, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 @@ -5413,11 +5414,11 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, ; ; VI-LABEL: struct_argument_alignment_after: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s8, s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dword s9, s[0:1], 0x3c -; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x44 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 +; VI-NEXT: s_load_dword s8, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c +; VI-NEXT: s_load_dword s9, s[2:3], 0x3c +; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x54 ; VI-NEXT: v_mov_b32_e32 v4, 0 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5445,19 +5446,19 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, ; ; GFX9-LABEL: struct_argument_alignment_after: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s10, s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 -; GFX9-NEXT: s_load_dword s11, s[4:5], 0x18 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x30 +; GFX9-NEXT: s_load_dword s10, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 
s[4:5], s[6:7], 0x8 +; GFX9-NEXT: s_load_dword s11, s[6:7], 0x18 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x20 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s10 ; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 @@ -5545,7 +5546,7 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { ; SI-LABEL: array_3xi32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5565,7 +5566,7 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { ; ; VI-LABEL: array_3xi32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -5583,7 +5584,7 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { ; ; GFX9-LABEL: array_3xi32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -5659,7 +5660,8 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { ; SI-LABEL: array_3xi16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_mov_b64 s[0:1], s[2:3] +; SI-NEXT: 
s_load_dword s4, s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:42 @@ -5679,22 +5681,22 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { ; ; VI-LABEL: array_3xi16: ; VI: ; %bb.0: -; VI-NEXT: s_add_u32 s2, s0, 38 -; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: s_add_u32 s4, s2, 2 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_add_u32 s2, s0, 42 -; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_add_u32 s0, s2, 38 +; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_add_u32 s4, s0, 2 +; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_add_u32 s0, s2, 42 +; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_load_ushort v4, v[0:1] ; VI-NEXT: flat_load_ushort v2, v[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5711,10 +5713,10 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { ; GFX9-LABEL: array_3xi16: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:6 -; GFX9-NEXT: global_load_ushort v2, v0, s[4:5] offset:4 -; GFX9-NEXT: global_load_ushort v3, v0, s[4:5] offset:2 -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] offset:6 +; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:4 +; GFX9-NEXT: global_load_ushort v3, v0, s[6:7] offset:2 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ 
-5829,6 +5831,7 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { ; SI-LABEL: small_array_round_down_offset: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b64 s[0:1], s[2:3] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:37 @@ -5839,8 +5842,8 @@ define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { ; ; VI-LABEL: small_array_round_down_offset: ; VI: ; %bb.0: -; VI-NEXT: s_add_u32 s0, s0, 37 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s0, s2, 37 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_load_ubyte v0, v[0:1] @@ -5852,7 +5855,7 @@ define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { ; GFX9-LABEL: small_array_round_down_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v0, v0, s[4:5] offset:1 +; GFX9-NEXT: global_load_ubyte v0, v0, s[6:7] offset:1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -5886,8 +5889,8 @@ define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) { ; SI-LABEL: byref_align_constant_i32_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x49 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x49 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5901,13 +5904,13 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu ; ; VI-LABEL: byref_align_constant_i32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 
0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x124 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x124 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v3 @@ -5916,8 +5919,8 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu ; ; GFX9-LABEL: byref_align_constant_i32_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x100 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s0 @@ -5970,83 +5973,83 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) { ; SI-LABEL: byref_natural_align_constant_v16i32_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 -; SI-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0x29 -; SI-NEXT: s_mov_b32 s23, 0xf000 -; SI-NEXT: s_mov_b32 s22, -1 +; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s20, s[2:3], 0x29 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, 
s[20:23], 0 offset:48 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: v_mov_b32_e32 v1, s13 ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: v_mov_b32_e32 v3, s15 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:32 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[20:23], 0 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: byref_natural_align_constant_v16i32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s20, s[0:1], 0xa4 +; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s20, s[2:3], 0xa4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_add_u32 s0, s2, 48 -; VI-NEXT: s_addc_u32 s1, s3, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: s_add_u32 s0, s2, 32 +; VI-NEXT: s_add_u32 s2, s0, 48 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: s_add_u32 s2, s0, 32 ; VI-NEXT: 
v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: s_add_u32 s0, s2, 16 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_mov_b32_e32 v2, s14 ; VI-NEXT: v_mov_b32_e32 v3, s15 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s20 @@ -6056,9 +6059,9 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace ; ; GFX9-LABEL: byref_natural_align_constant_v16i32_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x80 +; GFX9-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x80 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s20 diff --git 
a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll index 1a73df341108f..f74f9a8f2bdd8 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll @@ -10,28 +10,28 @@ ; GCN: s_and_b32 ; HSA-VI: .amdhsa_kernarg_size 12 -define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { +define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) #0 { store i1 %x, ptr addrspace(1) %out, align 1 ret void } ; FUNC-LABEL: {{^}}v3i8_arg: -; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[8:9], 0x8 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x0 ; HSA-VI: .amdhsa_kernarg_size 12 -define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %in) nounwind { +define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %in) #0 { entry: store <3 x i8> %in, ptr addrspace(1) %out, align 4 ret void } ; FUNC-LABEL: {{^}}i65_arg: -; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 +; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x0 ; HSA-VI: .amdhsa_kernarg_size 24 -define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nounwind { +define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) #0 { entry: store i65 %in, ptr addrspace(1) %out, align 4 ret void @@ -39,7 +39,7 @@ entry: ; FUNC-LABEL: {{^}}empty_struct_arg: ; HSA-VI: .amdhsa_kernarg_size 0 -define amdgpu_kernel void @empty_struct_arg({} %in) nounwind { +define amdgpu_kernel void @empty_struct_arg({} %in) #0 { ret void } @@ -54,13 +54,13 @@ define amdgpu_kernel void @empty_struct_arg({} %in) nounwind { ; FIXME: Total argument size is computed wrong ; FUNC-LABEL: {{^}}struct_argument_alignment: -; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 -; HSA-VI: s_load_dwordx2 
s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 -; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[8:9], 0x0 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x8 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[8:9], 0x18 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x20 ; HSA-VI: .amdhsa_kernarg_size 40 -define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) { +define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) #0 { %val0 = extractvalue {i32, i64} %arg0, 0 %val1 = extractvalue {i32, i64} %arg0, 1 %val2 = extractvalue {i32, i64} %arg1, 0 @@ -78,11 +78,11 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, ; HSA-VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; HSA-VI: global_load_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:13 ; HSA-VI: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:17 -; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[8:9], 0x0 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x4 ; HSA-VI: .amdhsa_kernarg_size 28 -define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { +define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) #0 { %val0 = extractvalue <{i32, i64}> %arg0, 0 %val1 = extractvalue <{i32, i64}> %arg0, 1 %val2 = extractvalue <{i32, i64}> %arg1, 0 @@ -95,14 +95,14 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, } ; GCN-LABEL: {{^}}struct_argument_alignment_after: -; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 -; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18 -; HSA-VI: 
s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20 -; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[8:9], 0x0 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x8 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[8:9], 0x18 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x20 +; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x30 ; HSA-VI: .amdhsa_kernarg_size 64 -define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) { +define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) #0 { %val0 = extractvalue {i32, i64} %arg0, 0 %val1 = extractvalue {i32, i64} %arg0, 1 %val2 = extractvalue {i32, i64} %arg2, 0 @@ -116,7 +116,7 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, } ; GCN-LABEL: {{^}}array_3xi32: -; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 +; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x0 define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { store volatile i16 %arg0, ptr addrspace(1) undef store volatile [3 x i32] %arg1, ptr addrspace(1) undef @@ -124,7 +124,7 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { } ; GCN-LABEL: {{^}}array_3xi16: -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x0 define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { store volatile i8 %arg0, ptr addrspace(1) undef store volatile [3 x i16] %arg1, ptr addrspace(1) undef @@ -135,7 +135,7 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { ; GCN: s_load_dword [[DWORD:s[0-9]+]] ; GCN-DAG: s_bfe_u32 [[BFE:s[0-9]+]], [[DWORD]], 0x100010{{$}} ; GCN-DAG: s_and_b32 [[AND:s[0-9]+]], [[DWORD]], 0x7fff{{$}} -define amdgpu_kernel void @v2i15_arg(ptr addrspace(1) nocapture %out, <2 x i15> %in) { 
+define amdgpu_kernel void @v2i15_arg(ptr addrspace(1) nocapture %out, <2 x i15> %in) #0 { entry: store <2 x i15> %in, ptr addrspace(1) %out, align 4 ret void @@ -147,7 +147,7 @@ entry: ; GCN: s_and_b32 ; GCN: s_and_b32 ; GCN: s_or_b32 -define amdgpu_kernel void @v3i15_arg(ptr addrspace(1) nocapture %out, <3 x i15> %in) { +define amdgpu_kernel void @v3i15_arg(ptr addrspace(1) nocapture %out, <3 x i15> %in) #0 { entry: store <3 x i15> %in, ptr addrspace(1) %out, align 4 ret void @@ -156,9 +156,9 @@ entry: ; Byref pointers should only be treated as offsets from kernarg ; GCN-LABEL: {{^}}byref_constant_i8_arg: ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GCN: global_load_ubyte v{{[0-9]+}}, [[ZERO]], s[4:5] offset:8 +; GCN: global_load_ubyte v{{[0-9]+}}, [[ZERO]], s[8:9] offset:8 ; GCN: .amdhsa_kernarg_size 12 -define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i8) %in.byref) { +define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i8) %in.byref) #0 { %in = load i8, ptr addrspace(4) %in.byref %ext = zext i8 %in to i32 store i32 %ext, ptr addrspace(1) %out, align 4 @@ -167,9 +167,9 @@ define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out ; GCN-LABEL: {{^}}byref_constant_i16_arg: ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GCN: global_load_ushort v{{[0-9]+}}, [[ZERO]], s[4:5] offset:8 +; GCN: global_load_ushort v{{[0-9]+}}, [[ZERO]], s[8:9] offset:8 ; GCN: .amdhsa_kernarg_size 12 -define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i16) %in.byref) { +define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i16) %in.byref) #0 { %in = load i16, ptr addrspace(4) %in.byref %ext = zext i16 %in to i32 store i32 %ext, ptr addrspace(1) %out, align 4 @@ -177,9 +177,9 @@ define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) 
nocapture %ou } ; GCN-LABEL: {{^}}byref_constant_i32_arg: -; GCN: s_load_dwordx4 [[LOAD:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0{{$}} +; GCN: s_load_dwordx4 [[LOAD:s\[[0-9]+:[0-9]+\]]], s[8:9], 0x0{{$}} ; GCN: .amdhsa_kernarg_size 16 -define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) %in.byref, i32 %after.offset) { +define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) %in.byref, i32 %after.offset) #0 { %in = load i32, ptr addrspace(4) %in.byref store volatile i32 %in, ptr addrspace(1) %out, align 4 store volatile i32 %after.offset, ptr addrspace(1) %out, align 4 @@ -187,10 +187,10 @@ define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %ou } ; GCN-LABEL: {{^}}byref_constant_v4i32_arg: -; GCN: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10{{$}} -; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x20{{$}} +; GCN: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x10{{$}} +; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x20{{$}} ; GCN: .amdhsa_kernarg_size 36 -define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(<4 x i32>) %in.byref, i32 %after.offset) { +define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(<4 x i32>) %in.byref, i32 %after.offset) #0 { %in = load <4 x i32>, ptr addrspace(4) %in.byref store volatile <4 x i32> %in, ptr addrspace(1) %out, align 4 store volatile i32 %after.offset, ptr addrspace(1) %out, align 4 @@ -198,13 +198,13 @@ define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture % } ; GCN-LABEL: {{^}}byref_align_constant_i32_arg: -; GCN-DAG: s_load_dwordx2 s[[[IN:[0-9]+]]:[[AFTER_OFFSET:[0-9]+]]], s[4:5], 0x100{{$}} +; GCN-DAG: s_load_dwordx2 s[[[IN:[0-9]+]]:[[AFTER_OFFSET:[0-9]+]]], s[8:9], 0x100{{$}} ; GCN-DAG: v_mov_b32_e32 [[V_IN:v[0-9]+]], s[[IN]] ; GCN-DAG: v_mov_b32_e32 
[[V_AFTER_OFFSET:v[0-9]+]], s[[AFTER_OFFSET]] ; GCN: global_store_dword v{{[0-9]+}}, [[V_IN]], s ; GCN: global_store_dword v{{[0-9]+}}, [[V_AFTER_OFFSET]], s ; GCN: .amdhsa_kernarg_size 264 -define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) { +define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) #0 { %in = load i32, ptr addrspace(4) %in.byref store volatile i32 %in, ptr addrspace(1) %out, align 4 store volatile i32 %after.offset, ptr addrspace(1) %out, align 4 @@ -212,10 +212,10 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu } ; GCN-LABEL: {{^}}byref_natural_align_constant_v16i32_arg: -; GCN-DAG: s_load_dword s{{[0-9]+}}, s[4:5], 0x80 -; GCN-DAG: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x40{{$}} +; GCN-DAG: s_load_dword s{{[0-9]+}}, s[8:9], 0x80 +; GCN-DAG: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x40{{$}} ; GCN: .amdhsa_kernarg_size 132 -define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) { +define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) #0 { %in = load <16 x i32>, ptr addrspace(4) %in.byref store volatile <16 x i32> %in, ptr addrspace(1) %out, align 4 store volatile i32 %after.offset, ptr addrspace(1) %out, align 4 @@ -224,9 +224,9 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace ; Also accept byref kernel arguments with other global address spaces. 
; GCN-LABEL: {{^}}byref_global_i32_arg: -; GCN: s_load_dword [[IN:s[0-9]+]], s[4:5], 0x8{{$}} +; GCN: s_load_dword [[IN:s[0-9]+]], s[8:9], 0x8{{$}} ; GCN: .amdhsa_kernarg_size 12 -define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(1) byref(i32) %in.byref) { +define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(1) byref(i32) %in.byref) #0 { %in = load i32, ptr addrspace(1) %in.byref store i32 %in, ptr addrspace(1) %out, align 4 ret void @@ -234,17 +234,17 @@ define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, ; GCN-LABEL: {{^}}byref_flat_i32_arg: ; GCN: flat_load_dword [[IN:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}} offset:8{{$}} -define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, ptr byref(i32) %in.byref) { +define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, ptr byref(i32) %in.byref) #0 { %in = load i32, ptr %in.byref store i32 %in, ptr addrspace(1) %out, align 4 ret void } ; GCN-LABEL: {{^}}byref_constant_32bit_i32_arg: -; GCN: s_add_i32 s[[PTR_LO:[0-9]+]], s4, 8 +; GCN: s_add_i32 s[[PTR_LO:[0-9]+]], s8, 8 ; GCN: s_mov_b32 s[[PTR_HI:[0-9]+]], 0{{$}} ; GCN: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0{{$}} -define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(6) byref(i32) %in.byref) { +define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(6) byref(i32) %in.byref) #0 { %in = load i32, ptr addrspace(6) %in.byref store i32 %in, ptr addrspace(1) %out, align 4 ret void @@ -257,9 +257,9 @@ define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocaptu ; } ; GCN-LABEL: {{^}}multi_byref_constant_i32_arg: -; GCN: s_load_dwordx4 {{s\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 +; GCN: s_load_dwordx4 {{s\[[0-9]+:[0-9]+\]}}, s[8:9], 0x0 ; GCN: .amdhsa_kernarg_size 20 -define amdgpu_kernel void 
@multi_byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) %in0.byref, ptr addrspace(4) byref(i32) %in1.byref, i32 %after.offset) { +define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) %in0.byref, ptr addrspace(4) byref(i32) %in1.byref, i32 %after.offset) #0 { %in0 = load i32, ptr addrspace(4) %in0.byref %in1 = load i32, ptr addrspace(4) %in1.byref store volatile i32 %in0, ptr addrspace(1) %out, align 4 @@ -271,13 +271,15 @@ define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocaptu ; GCN-LABEL: {{^}}byref_constant_i32_arg_offset0: ; GCN-NOT: s4 ; GCN-NOT: s5 -; GCN: s_load_dword {{s[0-9]+}}, s[4:5], 0x0{{$}} +; GCN: s_load_dword {{s[0-9]+}}, s[8:9], 0x0{{$}} ; GCN: .amdhsa_kernarg_size 4 -define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref(i32) %in.byref) { +define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref(i32) %in.byref) #0 { %in = load i32, ptr addrspace(4) %in.byref store i32 %in, ptr addrspace(1) undef, align 4 ret void } +attributes #0 = { "amdgpu-no-implicitarg-ptr" } + !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll index 3e0ad65c49821..0a70734a65c20 100644 --- a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll @@ -114,9 +114,9 @@ define amdgpu_ps void @only_kill() #0 { ; CHECK-NEXT: ; %bb.3: ; %DummyReturnBlock ; CHECK-NEXT: s_endpgm ; CHECK-NEXT: .LBB2_4: -; CHECK-NEXT: s_mov_b64 exec, 0 -; CHECK-NEXT: exp null off, off, off, off done vm -; CHECK-NEXT: s_endpgm +; CHECK-NEXT: s_mov_b64 exec, 0 +; CHECK-NEXT: exp null off, off, off, off done vm +; CHECK-NEXT: s_endpgm main_body: br label %loop diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll 
b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll index cb6073e9341e0..7698372b68779 100644 --- a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll +++ b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll @@ -26,20 +26,20 @@ ; GCNHSA: .amdhsa_group_segment_fixed_size 0 ; GCNHSA: .amdhsa_private_segment_fixed_size 32772 ; GCNHSA: .amdhsa_user_sgpr_private_segment_buffer 1 -; GCNHSA: .amdhsa_user_sgpr_dispatch_ptr 0 -; GCNHSA: .amdhsa_user_sgpr_queue_ptr 0 +; GCNHSA: .amdhsa_user_sgpr_dispatch_ptr 1 +; GCNHSA: .amdhsa_user_sgpr_queue_ptr 1 ; GCNHSA: .amdhsa_user_sgpr_kernarg_segment_ptr 1 -; GCNHSA: .amdhsa_user_sgpr_dispatch_id 0 +; GCNHSA: .amdhsa_user_sgpr_dispatch_id 1 ; GCNHSA: .amdhsa_user_sgpr_flat_scratch_init 1 ; GCNHSA: .amdhsa_user_sgpr_private_segment_size 0 ; GCNHSA: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; GCNHSA: .amdhsa_system_sgpr_workgroup_id_x 1 -; GCNHSA: .amdhsa_system_sgpr_workgroup_id_y 0 -; GCNHSA: .amdhsa_system_sgpr_workgroup_id_z 0 +; GCNHSA: .amdhsa_system_sgpr_workgroup_id_y 1 +; GCNHSA: .amdhsa_system_sgpr_workgroup_id_z 1 ; GCNHSA: .amdhsa_system_sgpr_workgroup_info 0 -; GCNHSA: .amdhsa_system_vgpr_workitem_id 0 +; GCNHSA: .amdhsa_system_vgpr_workitem_id 2 ; GCNHSA: .amdhsa_next_free_vgpr 3 -; GCNHSA: .amdhsa_next_free_sgpr 10 +; GCNHSA: .amdhsa_next_free_sgpr 18 ; GCNHSA: .amdhsa_float_round_mode_32 0 ; GCNHSA: .amdhsa_float_round_mode_16_64 0 ; GCNHSA: .amdhsa_float_denorm_mode_32 3 diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll index 9619cb73b1538..266ab687cd8d5 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll @@ -93,7 +93,7 @@ define void @use_extern_overalign() #0 { define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) { ; CHECK-LABEL: module_0_kernel_normal_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 +; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0 ; 
CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -113,23 +113,27 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) { define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) { ; CHECK-LABEL: module_1_kernel_normal_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s6, s6, s9 +; CHECK-NEXT: s_add_u32 s10, s10, s15 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s7, s7, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_addc_u32 s9, s5, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 -; CHECK-NEXT: s_load_dword s12, s[4:5], 0x0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] +; CHECK-NEXT: s_add_u32 s8, s6, 8 +; CHECK-NEXT: s_addc_u32 s9, s7, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 +; CHECK-NEXT: s_load_dword s15, s[6:7], 0x0 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11] -; CHECK-NEXT: s_lshl_b32 s4, s12, 2 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] +; CHECK-NEXT: s_lshl_b32 s4, s15, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_add_i32 s4, s4, 4 
@@ -152,7 +156,7 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) { define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) { ; CHECK-LABEL: module_0_kernel_overalign_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 +; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -172,23 +176,27 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) { define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) { ; CHECK-LABEL: module_1_kernel_overalign_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s6, s6, s9 +; CHECK-NEXT: s_add_u32 s10, s10, s15 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s7, s7, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_addc_u32 s9, s5, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 -; CHECK-NEXT: s_load_dword s12, s[4:5], 0x0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] +; CHECK-NEXT: s_add_u32 s8, s6, 8 +; CHECK-NEXT: s_addc_u32 s9, s7, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 +; CHECK-NEXT: s_load_dword s15, s[6:7], 0x0 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, 
v1 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11] -; CHECK-NEXT: s_lshl_b32 s4, s12, 2 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] +; CHECK-NEXT: s_lshl_b32 s4, s15, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_add_i32 s4, s4, 8 @@ -211,7 +219,7 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) { define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_0_kernel_normal_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 +; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -231,23 +239,27 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) { define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_1_kernel_normal_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s6, s6, s9 +; CHECK-NEXT: s_add_u32 s10, s10, s15 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s7, s7, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_addc_u32 s9, s5, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 -; CHECK-NEXT: s_load_dword s12, s[4:5], 0x0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] +; CHECK-NEXT: s_add_u32 s8, s6, 8 +; CHECK-NEXT: s_addc_u32 s9, 
s7, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 +; CHECK-NEXT: s_load_dword s15, s[6:7], 0x0 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11] -; CHECK-NEXT: s_lshl_b32 s4, s12, 2 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] +; CHECK-NEXT: s_lshl_b32 s4, s15, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_add_i32 s4, s4, 8 @@ -270,7 +282,7 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) { define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_0_kernel_overalign_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 +; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -290,23 +302,27 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx) define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_1_kernel_overalign_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s6, s6, s9 +; CHECK-NEXT: s_add_u32 s10, s10, s15 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s7, s7, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_addc_u32 s9, 
s5, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 -; CHECK-NEXT: s_load_dword s12, s[4:5], 0x0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] +; CHECK-NEXT: s_add_u32 s8, s6, 8 +; CHECK-NEXT: s_addc_u32 s9, s7, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 +; CHECK-NEXT: s_load_dword s15, s[6:7], 0x0 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11] -; CHECK-NEXT: s_lshl_b32 s4, s12, 2 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] +; CHECK-NEXT: s_lshl_b32 s4, s15, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_add_i32 s4, s4, 8 @@ -336,25 +352,29 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %idx) { ; CHECK-LABEL: module_0_kernel_normal_indirect_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s6, s6, s9 +; CHECK-NEXT: s_add_u32 s10, s10, s15 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s7, s7, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_addc_u32 s9, s5, 0 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, 
use_extern_normal@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_normal@gotpcrel32@hi+12 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] +; CHECK-NEXT: s_add_u32 s8, s6, 8 +; CHECK-NEXT: s_addc_u32 s9, s7, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: v_mov_b32_e32 v3, 0 +; CHECK-NEXT: v_mov_b32_e32 v4, 2 ; CHECK-NEXT: s_mov_b32 s15, 0 -; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: ds_write_b16 v3, v4 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: s_endpgm store i16 2, ptr addrspace(3) @kernel_normal @@ -365,33 +385,37 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %idx) { ; CHECK-LABEL: module_1_kernel_normal_indirect_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s6, s6, s9 +; CHECK-NEXT: s_add_u32 s10, s10, s15 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s7, s7, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_addc_u32 s9, s5, 0 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, 
use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] +; CHECK-NEXT: s_add_u32 s8, s6, 8 +; CHECK-NEXT: s_addc_u32 s9, s7, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: s_mov_b32 s15, 4 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, use_extern_normal@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_normal@gotpcrel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 -; CHECK-NEXT: s_mov_b32 s15, 4 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: ds_write_b16 v0, v2 offset:2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: s_endpgm call void @use_module() store i16 1, ptr addrspace(3) @module_variable @@ -405,25 +429,29 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %id define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 %idx) { ; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s6, s6, s9 +; CHECK-NEXT: s_add_u32 s10, s10, s15 ; CHECK-NEXT: s_mov_b32 s32, 0 -; 
CHECK-NEXT: s_addc_u32 s7, s7, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_addc_u32 s9, s5, 0 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, use_extern_normal@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_normal@gotpcrel32@hi+12 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] +; CHECK-NEXT: s_add_u32 s8, s6, 8 +; CHECK-NEXT: s_addc_u32 s9, s7, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: v_mov_b32_e32 v3, 0 +; CHECK-NEXT: v_mov_b32_e32 v4, 2 ; CHECK-NEXT: s_mov_b32 s15, 2 -; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: ds_write_b16 v3, v4 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: s_endpgm store i16 2, ptr addrspace(3) @kernel_overalign @@ -434,33 +462,37 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32 %idx) { ; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s6, s6, s9 +; CHECK-NEXT: s_add_u32 s10, s10, s15 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: 
s_addc_u32 s7, s7, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_addc_u32 s9, s5, 0 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] +; CHECK-NEXT: s_add_u32 s8, s6, 8 +; CHECK-NEXT: s_addc_u32 s9, s7, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: s_mov_b32 s15, 6 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, use_extern_normal@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_normal@gotpcrel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 -; CHECK-NEXT: s_mov_b32 s15, 6 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: ds_write_b16 v0, v2 offset:4 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 
s[30:31], s[4:5] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: s_endpgm call void @use_module() store i16 1, ptr addrspace(3) @module_variable @@ -474,25 +506,29 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32 define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_0_kernel_normal_indirect_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s6, s6, s9 +; CHECK-NEXT: s_add_u32 s10, s10, s15 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s7, s7, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_addc_u32 s9, s5, 0 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, use_extern_overalign@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_overalign@gotpcrel32@hi+12 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] +; CHECK-NEXT: s_add_u32 s8, s6, 8 +; CHECK-NEXT: s_addc_u32 s9, s7, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_extern_overalign@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: v_mov_b32_e32 v3, 0 +; CHECK-NEXT: v_mov_b32_e32 v4, 2 ; CHECK-NEXT: s_mov_b32 s15, 1 -; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: ds_write_b16 v3, v4 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; 
CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: s_endpgm store i16 2, ptr addrspace(3) @kernel_normal @@ -503,33 +539,37 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_1_kernel_normal_indirect_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s6, s6, s9 +; CHECK-NEXT: s_add_u32 s10, s10, s15 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s7, s7, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_addc_u32 s9, s5, 0 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] +; CHECK-NEXT: s_add_u32 s8, s6, 8 +; CHECK-NEXT: s_addc_u32 s9, s7, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: s_mov_b32 s15, 5 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, use_extern_overalign@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_overalign@gotpcrel32@hi+12 +; CHECK-NEXT: s_swappc_b64 
s[30:31], s[6:7] +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_extern_overalign@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 -; CHECK-NEXT: s_mov_b32 s15, 5 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: ds_write_b16 v0, v2 offset:2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: s_endpgm call void @use_module() store i16 1, ptr addrspace(3) @module_variable @@ -543,25 +583,29 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32 define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s6, s6, s9 +; CHECK-NEXT: s_add_u32 s10, s10, s15 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s7, s7, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_addc_u32 s9, s5, 0 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, use_extern_overalign@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_overalign@gotpcrel32@hi+12 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] +; CHECK-NEXT: s_add_u32 s8, s6, 8 +; CHECK-NEXT: s_addc_u32 s9, 
s7, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_extern_overalign@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: v_mov_b32_e32 v3, 0 +; CHECK-NEXT: v_mov_b32_e32 v4, 2 ; CHECK-NEXT: s_mov_b32 s15, 3 -; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: ds_write_b16 v3, v4 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: s_endpgm store i16 2, ptr addrspace(3) @kernel_overalign @@ -572,33 +616,37 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s6, s6, s9 +; CHECK-NEXT: s_add_u32 s10, s10, s15 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s7, s7, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; CHECK-NEXT: s_add_u32 s0, s0, s9 +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_addc_u32 s9, s5, 0 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] +; CHECK-NEXT: s_add_u32 s8, s6, 8 +; CHECK-NEXT: s_addc_u32 s9, s7, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, 
use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: s_mov_b32 s15, 7 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, use_extern_overalign@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_overalign@gotpcrel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_extern_overalign@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 -; CHECK-NEXT: s_mov_b32 s15, 7 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: ds_write_b16 v0, v2 offset:4 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: s_endpgm call void @use_module() store i16 1, ptr addrspace(3) @module_variable diff --git a/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll index e1124f3ba89b5..9899d20cf3ae6 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=amdgcn -mcpu=tahiti -stop-after=amdgpu-isel -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX6 %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -stop-after=amdgpu-isel -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti -stop-after=amdgpu-isel -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: 
llc -mtriple=amdgcn -mcpu=tonga -stop-after=amdgpu-isel -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: not llc -mtriple=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck %s ; RUN: not llc -mtriple=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s @@ -11,21 +11,21 @@ define amdgpu_kernel void @load_zeroinit_lds_global(ptr addrspace(1) %out, i1 %p) { ; GCN-LABEL: name: load_zeroinit_lds_global ; GCN: bb.0 (%ir-block.0): - ; GCN: liveins: $sgpr0_sgpr1 - ; GCN: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 - ; GFX6: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 - ; GFX8: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 - ; GFX6: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 - ; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0 - ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3 + ; GCN: liveins: $sgpr2_sgpr3 + ; GCN: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; GFX8: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GFX8: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 + ; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0 + ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX8: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) 
@lds, implicit $exec ; GCN: SI_INIT_M0 -1, implicit-def $m0 ; GCN: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 killed [[V_MOV_B32_e32_]], 40, 0, implicit $m0, implicit $exec - ; GFX8: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] - ; GFX6: BUFFER_STORE_DWORD_OFFSET killed [[DS_READ_B32_]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec - ; GFX8: FLAT_STORE_DWORD killed [[COPY1]], killed [[DS_READ_B32_]], 0, 0, implicit $exec, implicit $flat_scr + ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] + ; GFX8: BUFFER_STORE_DWORD_OFFSET killed [[DS_READ_B32_]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec + ; GFX9: FLAT_STORE_DWORD killed [[COPY1]], killed [[DS_READ_B32_]], 0, 0, implicit $exec, implicit $flat_scr ; GCN: S_ENDPGM 0 %gep = getelementptr [256 x i32], ptr addrspace(3) @lds, i32 0, i32 10 %ld = load i32, ptr addrspace(3) %gep diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 952e89edeb799..b61838c06a1f9 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -188,9 +188,6 @@ ; GCN-O1-NEXT: Function Alias Analysis Results ; GCN-O1-NEXT: Lower OpenCL enqueued blocks ; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions -; GCN-O1-NEXT: AMDGPU Attributor -; GCN-O1-NEXT: FunctionPass Manager -; GCN-O1-NEXT: Cycle Info Analysis ; GCN-O1-NEXT: FunctionPass Manager ; GCN-O1-NEXT: Infer address spaces ; GCN-O1-NEXT: Dominator Tree Construction @@ -465,9 +462,6 @@ ; GCN-O1-OPTS-NEXT: Function Alias Analysis Results ; GCN-O1-OPTS-NEXT: Lower OpenCL enqueued blocks ; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions -; GCN-O1-OPTS-NEXT: AMDGPU Attributor -; GCN-O1-OPTS-NEXT: FunctionPass Manager -; GCN-O1-OPTS-NEXT: Cycle Info Analysis ; GCN-O1-OPTS-NEXT: FunctionPass Manager ; GCN-O1-OPTS-NEXT: Infer address spaces ; GCN-O1-OPTS-NEXT: Dominator Tree Construction @@ -772,9 +766,6 @@ ; 
GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Lower OpenCL enqueued blocks ; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions -; GCN-O2-NEXT: AMDGPU Attributor -; GCN-O2-NEXT: FunctionPass Manager -; GCN-O2-NEXT: Cycle Info Analysis ; GCN-O2-NEXT: FunctionPass Manager ; GCN-O2-NEXT: Infer address spaces ; GCN-O2-NEXT: Dominator Tree Construction @@ -1083,9 +1074,6 @@ ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Lower OpenCL enqueued blocks ; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions -; GCN-O3-NEXT: AMDGPU Attributor -; GCN-O3-NEXT: FunctionPass Manager -; GCN-O3-NEXT: Cycle Info Analysis ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: Infer address spaces ; GCN-O3-NEXT: Dominator Tree Construction diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll index 9b63a8a3efcf9..9445f1225e0cb 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll @@ -9,7 +9,7 @@ define float @raw_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 inreg ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -27,7 +27,7 @@ define void @raw_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 inr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -44,7 +44,7 @@ define void @raw_buffer_atomic_cond_sub_no_return_forced(<4 x i32> 
inreg %rsrc, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: @@ -60,7 +60,7 @@ define float @raw_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -79,7 +79,7 @@ define void @raw_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -97,7 +97,7 @@ define void @raw_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inre ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -114,7 +114,7 @@ define float @struct_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 in ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -132,7 
+132,7 @@ define void @struct_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -149,7 +149,7 @@ define void @struct_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: @@ -165,7 +165,7 @@ define float @struct_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -184,7 +184,7 @@ define void @struct_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %r ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -202,7 +202,7 @@ define void @struct_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> i ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: 
s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen ; GFX12-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll index 5a15dc53a292c..61f0f20f05704 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll @@ -510,4 +510,4 @@ true: ret i32 42 false: ret i32 33 -} \ No newline at end of file +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll index ca7385be5dee7..be270439ef57c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll @@ -4,7 +4,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pk_i16_i32: -; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[0:1], 0x{{9|24}} +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[2:3], 0x{{9|24}} ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[#LOAD + 3]] ; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] ; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll index b59e584418bd8..50561de5bdbd2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll @@ -4,7 +4,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pk_u16_u32: -; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[0:1], 0x{{9|24}} +; GCN-DAG: 
s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[2:3], 0x{{9|24}} ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[#LOAD + 3]] ; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] ; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll index 0093e30b03644..ce6336da4fd96 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll @@ -4,7 +4,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pknorm_i16_f32: -; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[0:1], 0x{{9|24}} +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[2:3], 0x{{9|24}} ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[#LOAD + 3]] ; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] ; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll index d896090a47665..66b4f143c60d0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll @@ -4,7 +4,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pknorm_u16_f32: -; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[0:1], 0x{{9|24}} +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[2:3], 0x{{9|24}} ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[#LOAD + 3]] ; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] ; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll index 920ff8a927e2d..e1caf3bea6119 
100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x, float %y) #0 { ; SI-LABEL: s_cvt_pkrtz_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -21,7 +21,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x ; ; VI-LABEL: s_cvt_pkrtz_v2f16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s2, v0 @@ -32,7 +32,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x ; ; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -42,7 +42,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x ; ; GFX10-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s2, s3 @@ -51,7 +51,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x ; ; GFX11-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s2, s3 @@ -67,8 +67,8 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr 
addrspace(1) %out, float %x define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out, float %x) #0 { ; SI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -78,10 +78,10 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out, ; ; VI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s2, s2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -89,33 +89,33 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out, ; ; GFX9-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s4, s4 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s4, s4 -; 
GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s2, s2 +; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s4, s4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -145,8 +145,8 @@ define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -165,8 +165,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -188,13 +188,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -203,13 +203,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e32 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -218,8 +218,10 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -246,7 +248,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr 
addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -262,7 +264,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -279,7 +281,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -290,7 +292,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -301,7 +303,9 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -324,7 +328,7 @@ define amdgpu_kernel void 
@v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -340,7 +344,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -357,7 +361,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -368,7 +372,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -379,7 +383,9 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -402,8 +408,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -422,8 +428,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -445,13 +451,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -460,13 +466,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; GFX10: ; 
%bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -475,8 +481,10 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -504,8 +512,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -524,8 +532,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; VI: ; %bb.0: -; 
VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -547,13 +555,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -562,13 +570,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, v1, -v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -577,8 +585,10 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; 
GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -606,8 +616,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -626,8 +636,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -649,13 +659,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; 
GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -664,13 +674,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -v1, -v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -679,8 +689,10 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -709,8 +721,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; SI: ; %bb.0: -; 
SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -729,8 +741,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -752,13 +764,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -|v1|, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -767,13 +779,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -|v1|, -v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -782,8 +794,10 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll index f8a1388c9415e..50f1beba25227 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll @@ -4,19 +4,54 @@ declare i64 @llvm.amdgcn.dispatch.id() #1 ; GCN-LABEL: {{^}}dispatch_id: +; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s10 +; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s11 +; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]] +; GCN: .amdhsa_user_sgpr_dispatch_id 1 +define amdgpu_kernel void @dispatch_id(ptr addrspace(1) %out) #0 { + %tmp0 = call i64 @llvm.amdgcn.dispatch.id() + store i64 %tmp0, ptr addrspace(1) %out + ret void +} +; GCN-LABEL: {{^}}dispatch_id_opt0: +; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s8 +; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s9 +; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]] +; GCN: .amdhsa_user_sgpr_dispatch_id 1 +define amdgpu_kernel void @dispatch_id_opt0(ptr addrspace(1) 
%out) #2 { + %tmp0 = call i64 @llvm.amdgcn.dispatch.id() + store i64 %tmp0, ptr addrspace(1) %out + ret void +} + +; GCN-LABEL: {{^}}dispatch_id_opt1: ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s6 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s7 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]] ; GCN: .amdhsa_user_sgpr_dispatch_id 1 -define amdgpu_kernel void @dispatch_id(ptr addrspace(1) %out) #0 { +define amdgpu_kernel void @dispatch_id_opt1(ptr addrspace(1) %out) #3 { %tmp0 = call i64 @llvm.amdgcn.dispatch.id() store i64 %tmp0, ptr addrspace(1) %out ret void } +; GCN-LABEL: {{^}}dispatch_id_opt2: +; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s4 +; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s5 +; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]] +; GCN: .amdhsa_user_sgpr_dispatch_id 1 +define amdgpu_kernel void @dispatch_id_opt2() #4 { + %tmp0 = call i64 @llvm.amdgcn.dispatch.id() + store i64 %tmp0, ptr addrspace(1) null + ret void +} + attributes #0 = { nounwind } attributes #1 = { nounwind readnone } +attributes #2 = { "amdgpu-no-dispatch-ptr" } +attributes #3 = { "amdgpu-no-dispatch-ptr" "amdgpu-no-queue-ptr" } +attributes #4 = { "amdgpu-no-dispatch-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-implicitarg-ptr" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll index 95e50da8a4709..dcbfef0acadca 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll @@ -17,3 +17,5 @@ declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { convergent inaccessiblememonly nounwind } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; MIR: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll index 3b64a8707b55e..18c711d0b2aec 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll @@ -59,6 +59,7 @@ define amdgpu_kernel void @one_f32() #0 { define amdgpu_kernel void @id_i32() #0 { ; GFX11-LABEL: id_i32: ; GFX11: ; %bb.0: +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 m0, 0 ; GFX11-NEXT: exp pos0 v0, off, off, off done row_en ; GFX11-NEXT: s_endpgm @@ -70,7 +71,8 @@ define amdgpu_kernel void @id_i32() #0 { define amdgpu_kernel void @id_arg_i32(i32 %row) #0 { ; GFX11-LABEL: id_arg_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 m0, s0 ; GFX11-NEXT: exp pos0 v0, off, off, off done row_en @@ -84,16 +86,19 @@ define amdgpu_kernel void @id_arg_i32(i32 %row) #0 { define amdgpu_kernel void @id_row_i32() #0 { ; GFX11-SDAG-LABEL: id_row_i32: ; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x63 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: s_mov_b32 m0, s0 ; GFX11-SDAG-NEXT: exp pos0 v0, off, off, off done row_en ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: id_row_i32: ; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x63 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_readfirstlane_b32 m0, v0 ; GFX11-GISEL-NEXT: exp pos0 v1, off, off, off done row_en ; GFX11-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll index 17b941c59fd3f..a26b84e17374a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll @@ -15,7 +15,7 @@ declare half @llvm.fabs.f16(half) #0 define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; SDAG-GFX11-LABEL: v_fcmp_f32_oeq_with_fabs: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -27,7 +27,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| @@ -37,7 +37,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_with_fabs: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| @@ -50,7 +50,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| @@ -66,7 +66,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) 
%out, float define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; SDAG-GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -78,7 +78,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| @@ -88,7 +88,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| @@ -101,7 +101,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| @@ -126,7 +126,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f32: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 
0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -136,7 +136,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f32: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1] @@ -150,10 +150,10 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_oeq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -164,23 +164,23 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; 
SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -191,13 +191,13 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 1) store i32 %result, ptr addrspace(1) %out @@ -208,10 +208,10 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_one: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 
0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -222,23 +222,23 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_one: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_one: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -249,13 +249,13 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_one: ; GISEL-GFX10: ; %bb.0: ; 
GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 6) store i32 %result, ptr addrspace(1) %out @@ -266,10 +266,10 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ogt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -280,23 +280,23 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ogt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: 
v_cmp_lt_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ogt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -307,13 +307,13 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ogt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 2) store i32 %result, ptr addrspace(1) %out @@ -324,10 +324,10 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; 
SDAG-GFX11-LABEL: v_fcmp_f32_oge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -338,23 +338,23 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_oge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: 
v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -365,13 +365,13 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_oge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 3) store i32 %result, ptr addrspace(1) %out @@ -382,10 +382,10 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_olt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -396,23 +396,23 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_olt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_olt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -423,13 +423,13 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_olt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; 
GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 4) store i32 %result, ptr addrspace(1) %out @@ -440,10 +440,10 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ole: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -454,23 +454,23 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ole: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ole: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 
v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -481,13 +481,13 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ole: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 5) store i32 %result, ptr addrspace(1) %out @@ -498,10 +498,10 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_o: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -512,23 +512,23 @@ define amdgpu_kernel 
void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_o: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_o_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_o: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -539,13 +539,13 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_o: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_o_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; 
GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 7) store i32 %result, ptr addrspace(1) %out @@ -556,10 +556,10 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_uo: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -570,23 +570,23 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_uo: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_u_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_uo: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, 
s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -597,13 +597,13 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_uo: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_u_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 8) store i32 %result, ptr addrspace(1) %out @@ -614,10 +614,10 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ueq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s4 ; 
SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -628,23 +628,23 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ueq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlg_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ueq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -655,13 +655,13 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ueq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlg_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 9) store i32 %result, ptr addrspace(1) %out @@ -672,10 +672,10 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_une: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -686,23 +686,23 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_une: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: 
v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_une: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -713,13 +713,13 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_une: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 14) store i32 %result, ptr addrspace(1) %out @@ -730,10 +730,10 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ugt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; 
SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -744,23 +744,23 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ugt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nge_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ugt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -771,13 +771,13 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: 
v_fcmp_f32_ugt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nge_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 10) store i32 %result, ptr addrspace(1) %out @@ -788,10 +788,10 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_uge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -802,23 +802,23 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_uge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: 
s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ngt_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_uge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -829,13 +829,13 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_uge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ngt_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 11) store i32 %result, ptr addrspace(1) %out @@ -846,10 +846,10 @@ define amdgpu_kernel void 
@v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ult: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -860,23 +860,23 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ult: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nle_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ult: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -887,13 +887,13 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ult: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nle_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 12) store i32 %result, ptr addrspace(1) %out @@ -904,10 +904,10 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ule: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s2 +; SDAG-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -918,23 +918,23 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ule: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: 
s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlt_f32_e64 s0, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ule: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s2 +; GISEL-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -945,13 +945,13 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ule: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlt_f32_e64 s0, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; 
GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 13) store i32 %result, ptr addrspace(1) %out @@ -961,7 +961,7 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_oeq: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -973,7 +973,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_oeq: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] @@ -983,7 +983,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_oeq: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] @@ -996,7 +996,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_oeq: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] @@ -1011,7 +1011,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr 
addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_one: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1023,7 +1023,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_one: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1033,7 +1033,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_one: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1046,7 +1046,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_one: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1061,7 +1061,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ogt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: 
s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1073,7 +1073,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ogt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] @@ -1083,7 +1083,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ogt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] @@ -1096,7 +1096,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ogt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] @@ -1111,7 +1111,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_oge: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1123,7 +1123,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr 
addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_oge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] @@ -1133,7 +1133,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_oge: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] @@ -1146,7 +1146,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_oge: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] @@ -1161,7 +1161,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_olt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1173,7 +1173,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_olt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; 
SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] @@ -1183,7 +1183,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_olt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] @@ -1196,7 +1196,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_olt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] @@ -1211,7 +1211,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ole: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1223,7 +1223,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ole: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] @@ -1233,7 +1233,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ole: ; GISEL-GFX11: ; 
%bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] @@ -1246,7 +1246,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ole: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] @@ -1261,7 +1261,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ueq: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1273,7 +1273,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ueq: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] @@ -1283,7 +1283,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ueq: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, 
s[2:3] @@ -1296,7 +1296,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ueq: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] @@ -1311,7 +1311,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_o: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1323,7 +1323,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_o: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] @@ -1333,7 +1333,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_o: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] @@ -1346,7 +1346,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_o: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] @@ -1361,7 +1361,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_uo: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1373,7 +1373,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_uo: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] @@ -1383,7 +1383,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_uo: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] @@ -1396,7 +1396,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_uo: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] @@ -1411,7 +1411,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { define 
amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_une: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1423,7 +1423,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_une: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1433,7 +1433,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_une: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1446,7 +1446,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_une: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1461,7 +1461,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ugt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; 
SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1473,7 +1473,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ugt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] @@ -1483,7 +1483,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ugt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] @@ -1496,7 +1496,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ugt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] @@ -1511,7 +1511,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_uge: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1523,7 +1523,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { 
; ; SDAG-GFX10-LABEL: v_fcmp_f64_uge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] @@ -1533,7 +1533,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_uge: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] @@ -1546,7 +1546,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_uge: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] @@ -1561,7 +1561,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ult: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1573,7 +1573,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ult: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) 
; SDAG-GFX10-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] @@ -1583,7 +1583,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ult: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] @@ -1596,7 +1596,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ult: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] @@ -1611,7 +1611,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ule: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1623,7 +1623,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ule: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] @@ -1633,7 +1633,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ule: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] @@ -1646,7 +1646,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ule: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] @@ -1663,12 +1663,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; SDAG-GFX11-LABEL: v_fcmp_f16_oeq_with_fabs: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; SDAG-GFX11-NEXT: s_lshr_b32 s2, s4, 16 ; SDAG-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s2, |s3| +; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s4, |s2| ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; SDAG-GFX11-NEXT: s_nop 0 @@ -1678,26 +1678,26 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; SDAG-GFX10-LABEL: v_fcmp_f16_oeq_with_fabs: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt 
lgkmcnt(0) -; SDAG-GFX10-NEXT: s_lshr_b32 s0, s4, 16 -; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s0, s4, |s0| -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: s_lshr_b32 s2, s4, 16 +; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s2, s4, |s2| +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_oeq_with_fabs: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GISEL-GFX11-NEXT: s_lshr_b32 s2, s4, 16 ; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s2, |s3| +; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s4, |s2| ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GISEL-GFX11-NEXT: s_nop 0 @@ -1707,14 +1707,14 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; GISEL-GFX10-LABEL: v_fcmp_f16_oeq_with_fabs: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: s_lshr_b32 s0, s4, 16 -; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s0, s4, |s0| -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: s_lshr_b32 s2, s4, 16 +; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s2, s4, |s2| +; 
GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %temp = call half @llvm.fabs.f16(half %a) %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half %temp, i32 1) @@ -1727,12 +1727,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; SDAG-GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; SDAG-GFX11-NEXT: s_lshr_b32 s2, s4, 16 ; SDAG-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s2|, |s3| +; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s4|, |s2| ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; SDAG-GFX11-NEXT: s_nop 0 @@ -1742,26 +1742,26 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; SDAG-GFX10-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: s_lshr_b32 s0, s4, 16 -; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s0, |s4|, |s0| -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: s_lshr_b32 s2, s4, 16 +; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s2, |s4|, |s2| +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, 
v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GISEL-GFX11-NEXT: s_lshr_b32 s2, s4, 16 ; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s2|, |s3| +; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s4|, |s2| ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GISEL-GFX11-NEXT: s_nop 0 @@ -1771,14 +1771,14 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; GISEL-GFX10-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: s_lshr_b32 s0, s4, 16 -; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s0, |s4|, |s0| -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: s_lshr_b32 s2, s4, 16 +; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s2, |s4|, |s2| +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %temp = call half @llvm.fabs.f16(half %a) %src_input = call half @llvm.fabs.f16(half %src) @@ -1798,7 +1798,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; GISEL-GFX11-LABEL: 
v_fcmp_f16: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -1808,7 +1808,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f16: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1] @@ -1823,10 +1823,10 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_oeq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1837,23 +1837,23 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_oeq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword 
v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_oeq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1864,13 +1864,13 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_oeq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 1) store i32 %result, ptr addrspace(1) %out @@ -1882,10 +1882,10 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_one: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c 
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1896,23 +1896,23 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_one: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_one: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1923,13 +1923,13 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, 
half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_one: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 6) store i32 %result, ptr addrspace(1) %out @@ -1941,10 +1941,10 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ogt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1955,23 +1955,23 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ogt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; 
SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ogt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1982,13 +1982,13 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ogt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 2) store i32 %result, ptr addrspace(1) %out @@ -2000,10 +2000,10 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr 
addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_oge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2014,23 +2014,23 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_oge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_oge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: 
v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2041,13 +2041,13 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_oge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 3) store i32 %result, ptr addrspace(1) %out @@ -2059,10 +2059,10 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_olt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2073,23 +2073,23 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_olt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_olt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2100,13 +2100,13 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_olt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 
@llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 4) store i32 %result, ptr addrspace(1) %out @@ -2118,10 +2118,10 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ole: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2132,23 +2132,23 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ole: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ole: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; 
GISEL-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2159,13 +2159,13 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ole: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 5) store i32 %result, ptr addrspace(1) %out @@ -2177,10 +2177,10 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ueq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2191,23 +2191,23 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; 
SDAG-GFX10-LABEL: v_fcmp_f16_ueq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlg_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ueq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2218,13 +2218,13 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ueq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlg_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; 
GISEL-GFX10-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 9) store i32 %result, ptr addrspace(1) %out @@ -2236,10 +2236,10 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_une: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2250,23 +2250,23 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_une: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_une: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 
0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2277,13 +2277,13 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_une: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 14) store i32 %result, ptr addrspace(1) %out @@ -2295,10 +2295,10 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ugt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: 
v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2309,23 +2309,23 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ugt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nge_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ugt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2336,13 +2336,13 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ugt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; 
GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nge_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 10) store i32 %result, ptr addrspace(1) %out @@ -2354,10 +2354,10 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_uge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2368,23 +2368,23 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_uge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ngt_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; 
; GISEL-GFX11-LABEL: v_fcmp_f16_uge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2395,13 +2395,13 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_uge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ngt_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 11) store i32 %result, ptr addrspace(1) %out @@ -2413,10 +2413,10 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ult: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt 
lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2427,23 +2427,23 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ult: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nle_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ult: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2454,13 +2454,13 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ult: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nle_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 12) store i32 %result, ptr addrspace(1) %out @@ -2471,10 +2471,10 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_o: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2485,23 +2485,23 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_o: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_o_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; 
SDAG-GFX10-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_o: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2512,13 +2512,13 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_o: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_o_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 7) store i32 %result, ptr addrspace(1) %out @@ -2529,10 +2529,10 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_uo: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 
s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2543,23 +2543,23 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_uo: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_u_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_uo: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2570,13 +2570,13 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_uo: ; 
GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_u_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 8) store i32 %result, ptr addrspace(1) %out @@ -2587,10 +2587,10 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ule: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s2 +; SDAG-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2601,23 +2601,23 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ule: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; 
SDAG-GFX10-NEXT: v_cmp_nlt_f16_e64 s0, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ule: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s2 +; GISEL-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2628,13 +2628,13 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ule: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlt_f16_e64 s0, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 13) store i32 %result, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll index ce055d6527996..7e78d8b05d09f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll @@ -16,7 +16,7 @@ declare half @llvm.fabs.f16(half) #0 define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; GFX11-LABEL: v_fcmp_f32_oeq_with_fabs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3| @@ -30,7 +30,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; GFX9-LABEL: v_fcmp_f32_oeq_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s3 @@ -42,7 +42,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; VI-SDAG-LABEL: v_fcmp_f32_oeq_with_fabs: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |v0| @@ -55,7 +55,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; VI-GISEL-LABEL: v_fcmp_f32_oeq_with_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 ; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |v0| @@ -74,7 +74,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; 
GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |s3| @@ -88,7 +88,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; GFX9-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s3 @@ -100,7 +100,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; VI-SDAG-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |v0| @@ -113,7 +113,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; VI-GISEL-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 ; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |v0| @@ -137,7 +137,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; GFX11-GISEL-LABEL: v_fcmp_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -151,7 +151,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; 
GFX9-GISEL-LABEL: v_fcmp_f32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] @@ -163,7 +163,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -178,11 +178,11 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_oeq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], 0x42c80000, s2 +; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -193,24 +193,24 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_oeq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_f32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: 
v_cmp_eq_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_oeq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -220,11 +220,11 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_oeq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -240,11 +240,11 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_one: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s2 +; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -255,24 +255,24 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_one: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_one: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -282,11 +282,11 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_one: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 ; 
VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -302,11 +302,11 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ogt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f32_e64 s[2:3], 0x42c80000, s2 +; GFX11-NEXT: v_cmp_lt_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -317,24 +317,24 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ogt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_gt_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ogt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_gt_f32_e64 s[2:3], s2, v0 
+; VI-SDAG-NEXT: v_cmp_gt_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -344,11 +344,11 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ogt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_gt_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_gt_f32_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -364,11 +364,11 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_oge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_f32_e64 s[2:3], 0x42c80000, s2 +; GFX11-NEXT: v_cmp_le_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -379,24 +379,24 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_oge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], 
s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_oge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_ge_f32_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_ge_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -406,11 +406,11 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_oge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_ge_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_ge_f32_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -426,11 +426,11 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_olt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e64 s[2:3], 
0x42c80000, s2 +; GFX11-NEXT: v_cmp_gt_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -441,24 +441,24 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_olt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_lt_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_olt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -468,11 +468,11 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_olt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; 
VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -488,11 +488,11 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ole: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_f32_e64 s[2:3], 0x42c80000, s2 +; GFX11-NEXT: v_cmp_ge_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -503,24 +503,24 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ole: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_f32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_le_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ole: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_le_f32_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_le_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -530,11 +530,11 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ole: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_le_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_le_f32_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -550,11 +550,11 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_o: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f32_e64 s[2:3], 0x42c80000, s2 +; GFX11-NEXT: v_cmp_o_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -565,24 +565,24 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_o: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: 
v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_o_f32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_o_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_o: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_o_f32_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_o_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -592,11 +592,11 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_o: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_o_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_o_f32_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -612,11 +612,11 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_uo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_u_f32_e64 s[2:3], 0x42c80000, s2 +; GFX11-NEXT: v_cmp_u_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -627,24 +627,24 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_uo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_u_f32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_u_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_uo: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_u_f32_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_u_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -654,11 +654,11 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_uo: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c 
+; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_u_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_u_f32_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -674,11 +674,11 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ueq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], 0x42c80000, s2 +; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -689,24 +689,24 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ueq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ueq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nlg_f32_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -716,11 +716,11 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ueq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nlg_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -736,11 +736,11 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_une: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s2 +; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -751,24 +751,24 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_une: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 
0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_une: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -778,11 +778,11 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_une: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -798,11 +798,11 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ugt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 
0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nge_f32_e64 s[2:3], 0x42c80000, s2 +; GFX11-NEXT: v_cmp_nge_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -813,24 +813,24 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nle_f32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_nle_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ugt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nle_f32_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_nle_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -840,11 +840,11 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ugt: ; VI-GISEL: ; %bb.0: -; 
VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nle_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_nle_f32_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -860,11 +860,11 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_uge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ngt_f32_e64 s[2:3], 0x42c80000, s2 +; GFX11-NEXT: v_cmp_ngt_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -875,24 +875,24 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlt_f32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_nlt_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: 
s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_uge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -902,11 +902,11 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_uge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_nlt_f32_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -922,11 +922,11 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ult: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nle_f32_e64 s[2:3], 0x42c80000, s2 +; GFX11-NEXT: v_cmp_nle_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -937,24 +937,24 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, 
float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nge_f32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_nge_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ult: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nge_f32_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_nge_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -964,11 +964,11 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ult: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nge_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_nge_f32_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -984,11 +984,11 @@ define amdgpu_kernel void 
@v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ule: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlt_f32_e64 s[2:3], 0x42c80000, s2 +; GFX11-NEXT: v_cmp_nlt_f32_e64 s[2:3], 0x42c80000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -999,24 +999,24 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ngt_f32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ule: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_ngt_f32_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 
v1, s1 @@ -1026,11 +1026,11 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ule: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_ngt_f32_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_ngt_f32_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -1045,7 +1045,7 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_oeq: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1059,7 +1059,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_oeq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1072,7 +1072,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_oeq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1086,7 +1086,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: 
v_fcmp_f64_oeq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1105,7 +1105,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_one: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_neq_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1119,7 +1119,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_one: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1132,7 +1132,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_one: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1146,7 +1146,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_one: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1165,7 +1165,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ogt(ptr 
addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ogt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1179,7 +1179,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ogt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1192,7 +1192,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ogt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1206,7 +1206,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ogt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1225,7 +1225,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_oge: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1239,7 +1239,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; 
GFX9-LABEL: v_fcmp_f64_oge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1252,7 +1252,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_oge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1266,7 +1266,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_oge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1285,7 +1285,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_olt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1299,7 +1299,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_olt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1312,7 +1312,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_olt: ; VI-SDAG: ; %bb.0: -; 
VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1326,7 +1326,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_olt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1345,7 +1345,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ole: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ge_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1359,7 +1359,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ole: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1372,7 +1372,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ole: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1386,7 +1386,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ole: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1405,7 +1405,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ueq: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nlg_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1419,7 +1419,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ueq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1432,7 +1432,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ueq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1446,7 +1446,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ueq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1465,7 +1465,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_o: ; GFX11: ; 
%bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_o_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1479,7 +1479,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_o: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1492,7 +1492,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_o: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1506,7 +1506,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_o: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1525,7 +1525,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_uo: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_u_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1539,7 +1539,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_uo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; 
GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1552,7 +1552,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_uo: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1566,7 +1566,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_uo: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1585,7 +1585,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_une: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_neq_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1599,7 +1599,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_une: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1612,7 +1612,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_une: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; 
VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1626,7 +1626,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_une: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1645,7 +1645,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ugt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nge_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1659,7 +1659,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1672,7 +1672,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ugt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1686,7 +1686,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ugt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 
v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1705,7 +1705,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_uge: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ngt_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1719,7 +1719,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1732,7 +1732,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_uge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1746,7 +1746,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_uge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1765,7 +1765,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ult: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 
; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nle_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1779,7 +1779,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1792,7 +1792,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ult: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1806,7 +1806,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ult: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1825,7 +1825,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ule: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nlt_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1839,7 +1839,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1852,7 +1852,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ule: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1866,7 +1866,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ule: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1887,13 +1887,13 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; GFX11-LABEL: v_fcmp_f16_oeq_with_fabs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |s3| +; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, |s2| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1904,26 +1904,26 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; ; GFX9-LABEL: v_fcmp_f16_oeq_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s4, 16 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_cmp_eq_f16_e64 s[0:1], s4, |v0| -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: s_lshr_b32 s2, s4, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, |v0| +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_oeq_with_fabs: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_lshr_b32 s3, s2, 16 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 -; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |v0| +; VI-SDAG-NEXT: s_lshr_b32 s2, s4, 16 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, |v0| ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -1933,12 +1933,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; ; VI-GISEL-LABEL: v_fcmp_f16_oeq_with_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_lshr_b32 s3, s2, 16 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 -; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |v0| +; VI-GISEL-NEXT: s_lshr_b32 s2, s4, 16 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, |v0| ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: 
v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -1956,13 +1956,13 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |s3| +; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], |s4|, |s2| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1973,26 +1973,26 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; ; GFX9-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s4, 16 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_cmp_eq_f16_e64 s[0:1], |s4|, |v0| -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: s_lshr_b32 s2, s4, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_cmp_eq_f16_e64 s[2:3], |s4|, |v0| +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_lshr_b32 s3, s2, 16 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 -; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |v0| +; VI-SDAG-NEXT: s_lshr_b32 s2, s4, 16 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], |s4|, |v0| ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2002,12 +2002,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; ; VI-GISEL-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_lshr_b32 s3, s2, 16 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 -; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |v0| +; VI-GISEL-NEXT: s_lshr_b32 s2, s4, 16 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], |s4|, |v0| ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2028,7 +2028,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; GFX11-GISEL-LABEL: v_fcmp_f16: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2042,7 +2042,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; GFX9-GISEL-LABEL: v_fcmp_f16: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] @@ -2054,7 +2054,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -2070,11 +2070,11 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_oeq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], 0x5640, s2 +; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2085,24 +2085,24 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_oeq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_f16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm 
; ; VI-SDAG-LABEL: v_fcmp_f16_oeq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2112,11 +2112,11 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_oeq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2133,11 +2133,11 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_one: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s2 +; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2148,24 +2148,24 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; ; 
GFX9-LABEL: v_fcmp_f16_one: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_one: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2175,11 +2175,11 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_one: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2196,11 +2196,11 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half 
%src) { ; GFX11-LABEL: v_fcmp_f16_ogt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f16_e64 s[2:3], 0x5640, s2 +; GFX11-NEXT: v_cmp_lt_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2211,24 +2211,24 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ogt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_gt_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ogt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_gt_f16_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_gt_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2238,11 +2238,11 @@ define amdgpu_kernel void 
@v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ogt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_gt_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_gt_f16_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2259,11 +2259,11 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_oge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_f16_e64 s[2:3], 0x5640, s2 +; GFX11-NEXT: v_cmp_le_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2274,24 +2274,24 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_oge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_f16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_ge_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: 
v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_oge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_ge_f16_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_ge_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2301,11 +2301,11 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_oge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_ge_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_ge_f16_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2322,11 +2322,11 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_olt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f16_e64 s[2:3], 0x5640, s2 +; GFX11-NEXT: v_cmp_gt_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2337,24 
+2337,24 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_olt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_lt_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_olt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f16_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_lt_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2364,11 +2364,11 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_olt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_lt_f16_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ 
-2385,11 +2385,11 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ole: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_f16_e64 s[2:3], 0x5640, s2 +; GFX11-NEXT: v_cmp_ge_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2400,24 +2400,24 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ole: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_f16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_le_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ole: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_le_f16_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_le_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; 
VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2427,11 +2427,11 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ole: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_le_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_le_f16_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2448,11 +2448,11 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ueq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f16_e64 s[2:3], 0x5640, s2 +; GFX11-NEXT: v_cmp_nlg_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2463,24 +2463,24 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ueq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlg_f16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; 
GFX9-NEXT: v_cmp_nlg_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ueq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_nlg_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2490,11 +2490,11 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ueq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_nlg_f16_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2511,11 +2511,11 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_une: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s2 +; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2526,24 +2526,24 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_une: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_une: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2553,11 +2553,11 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_une: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 ; 
VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2574,11 +2574,11 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ugt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nge_f16_e64 s[2:3], 0x5640, s2 +; GFX11-NEXT: v_cmp_nge_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2589,24 +2589,24 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nle_f16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_nle_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ugt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nle_f16_e64 s[2:3], s2, v0 +; 
VI-SDAG-NEXT: v_cmp_nle_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2616,11 +2616,11 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ugt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nle_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_nle_f16_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2637,11 +2637,11 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_uge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ngt_f16_e64 s[2:3], 0x5640, s2 +; GFX11-NEXT: v_cmp_ngt_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2652,24 +2652,24 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlt_f16_e64 s[0:1], s4, v0 -; 
GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_nlt_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_uge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nlt_f16_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_nlt_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2679,11 +2679,11 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_uge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nlt_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_nlt_f16_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2700,11 +2700,11 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ult: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nle_f16_e64 s[2:3], 0x5640, s2 +; 
GFX11-NEXT: v_cmp_nle_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2715,24 +2715,24 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nge_f16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_nge_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ult: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nge_f16_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_nge_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2742,11 +2742,11 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ult: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) 
-; VI-GISEL-NEXT: v_cmp_nge_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_nge_f16_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2762,11 +2762,11 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_o: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f16_e64 s[2:3], 0x5640, s2 +; GFX11-NEXT: v_cmp_o_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2777,24 +2777,24 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_o: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_o_f16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_o_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_o: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; 
VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_o_f16_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_o_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2804,11 +2804,11 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_o: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_o_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_o_f16_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2824,11 +2824,11 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_uo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_u_f16_e64 s[2:3], 0x5640, s2 +; GFX11-NEXT: v_cmp_u_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2839,24 +2839,24 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_uo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_u_f16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_u_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_uo: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_u_f16_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_u_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2866,11 +2866,11 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_uo: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_u_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_u_f16_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2886,11 +2886,11 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ule: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX11-NEXT: v_cmp_nlt_f16_e64 s[2:3], 0x5640, s2 +; GFX11-NEXT: v_cmp_nlt_f16_e64 s[2:3], 0x5640, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2901,24 +2901,24 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ngt_f16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_ngt_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ule: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_ngt_f16_e64 s[2:3], s2, v0 +; VI-SDAG-NEXT: v_cmp_ngt_f16_e64 s[2:3], s4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2928,11 +2928,11 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ule: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: 
v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_ngt_f16_e64 s[2:3], s2, v0 +; VI-GISEL-NEXT: v_cmp_ngt_f16_e64 s[2:3], s4, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll index ca06a57be19cc..78d5da8dda177 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll @@ -8,7 +8,7 @@ declare bfloat @llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> %a, <2 x bfloat> %b, bf define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16( ; GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] @@ -34,18 +34,17 @@ entry: } define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16_dpp( -; SDAG-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_dpp: -; SDAG-GFX11: ; %bb.0: ; %entry -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: scratch_load_b32 v0, off, s2 -; SDAG-GFX11-NEXT: scratch_load_u16 v1, off, s3 -; SDAG-GFX11-NEXT: scratch_load_b32 v2, off, s1 -; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) -; SDAG-GFX11-NEXT: v_dot2_bf16_bf16_e64_dpp v0, v2, v0, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 -; SDAG-GFX11-NEXT: scratch_store_b16 off, v0, s0 -; SDAG-GFX11-NEXT: s_endpgm -; +; GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_dpp: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: scratch_load_b32 v0, off, s2 +; GFX11-NEXT: scratch_load_u16 v1, off, s3 +; GFX11-NEXT: scratch_load_b32 v2, off, s1 +; GFX11-NEXT: 
s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dot2_bf16_bf16_e64_dpp v0, v2, v0, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11-NEXT: scratch_store_b16 off, v0, s0 +; GFX11-NEXT: s_endpgm ; GISEL-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_dpp: ; GISEL-GFX11: ; %bb.0: ; %entry ; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 @@ -95,3 +94,5 @@ entry: } declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; SDAG-GFX11: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll index 99c3deaada8c6..1343f25ec275e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll @@ -7,7 +7,7 @@ declare half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %a, <2 x half> %b, half %c) define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16( ; GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] @@ -35,7 +35,7 @@ entry: define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16_dpp( ; SDAG-GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16_dpp: ; SDAG-GFX11: ; %bb.0: ; %entry -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: scratch_load_b32 v0, off, s2 ; SDAG-GFX11-NEXT: scratch_load_u16 v1, off, s3 @@ -47,7 +47,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16_dpp( ; ; GISEL-GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16_dpp: ; GISEL-GFX11: ; %bb.0: ; %entry -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; 
GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: scratch_load_b32 v0, off, s1 ; GISEL-GFX11-NEXT: scratch_load_b32 v1, off, s2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll index e51b1d2da2e41..8a8b0490e9480 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll @@ -7,7 +7,7 @@ declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, floa define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp( ; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -37,7 +37,7 @@ entry: define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp( ; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll index d318bc80e4976..e74485142fb6f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll @@ -107,4 +107,4 @@ declare float @llvm.amdgcn.fmul.legacy(float, float) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } -attributes #2 = { nounwind "denormal-fp-math"="preserve-sign" } +attributes #2 = { nounwind "denormal-fp-math"="preserve-sign" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" 
"amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll index 434fa1bf7b340..f631a0bfc28eb 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll @@ -7,7 +7,7 @@ declare i64 @llvm.amdgcn.global.atomic.ordered.add.b64(ptr addrspace(1), i64) define amdgpu_kernel void @global_atomic_ordered_add_b64_no_rtn(ptr addrspace(1) %addr, i64 %in) { ; GFX12-SDAG-LABEL: global_atomic_ordered_add_b64_no_rtn: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -16,7 +16,7 @@ define amdgpu_kernel void @global_atomic_ordered_add_b64_no_rtn(ptr addrspace(1) ; ; GFX12-GISEL-LABEL: global_atomic_ordered_add_b64_no_rtn: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -31,11 +31,12 @@ entry: define amdgpu_kernel void @global_atomic_ordered_add_b64_rtn(ptr addrspace(1) %addr, i64 %in, ptr addrspace(1) %use) { ; GFX12-SDAG-LABEL: global_atomic_ordered_add_b64_rtn: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-SDAG-NEXT: 
s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-SDAG-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] @@ -46,8 +47,8 @@ define amdgpu_kernel void @global_atomic_ordered_add_b64_rtn(ptr addrspace(1) %a ; GFX12-GISEL-LABEL: global_atomic_ordered_add_b64_rtn: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll index f6197e0770213..291c249e4b738 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll @@ -8,7 +8,7 @@ declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1)) define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX12-LABEL: global_load_tr_b64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32 @@ -27,7 +27,7 @@ entry: define amdgpu_kernel void @global_load_tr_b128(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX12-LABEL: global_load_tr_b128: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: 
s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll index a2dc3662fcc48..12742f4f7127b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll @@ -8,7 +8,7 @@ declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1)) define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX12-LABEL: global_load_tr_b64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32 @@ -27,7 +27,7 @@ entry: define amdgpu_kernel void @global_load_tr_b128(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX12-LABEL: global_load_tr_b128: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll index 309fd99031155..9e3e393d82e22 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll @@ -22,10 +22,10 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_eq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; 
SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -36,23 +36,23 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_eq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_eq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -63,13 +63,13 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_eq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 32) store i32 %result, ptr addrspace(1) %out @@ -87,7 +87,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i32: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -97,7 +97,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i32: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1] @@ -111,10 +111,10 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_ne: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -125,23 +125,23 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_ne: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_ne: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -152,13 +152,13 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_ne: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: 
v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 33) store i32 %result, ptr addrspace(1) %out @@ -169,10 +169,10 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_ugt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -183,23 +183,23 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_ugt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_u32_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; 
GISEL-GFX11-LABEL: v_icmp_i32_ugt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -210,13 +210,13 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_ugt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_u32_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 34) store i32 %result, ptr addrspace(1) %out @@ -227,10 +227,10 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_uge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: 
v_cmp_le_u32_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -241,23 +241,23 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_uge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_u32_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_le_u32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_uge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -268,13 +268,13 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_uge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: 
s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_u32_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_le_u32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 35) store i32 %result, ptr addrspace(1) %out @@ -285,10 +285,10 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_ult: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -299,23 +299,23 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_ult: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_u32_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: 
v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_ult: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -326,13 +326,13 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_ult: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_u32_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 36) store i32 %result, ptr addrspace(1) %out @@ -343,10 +343,10 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_ule: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; 
SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -357,23 +357,23 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_ule: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_u32_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_ule: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -384,13 +384,13 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_ule: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: 
s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_u32_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 37) store i32 %result, ptr addrspace(1) %out @@ -401,10 +401,10 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; SDAG-GFX11-LABEL: v_icmp_i32_sgt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -415,23 +415,23 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; SDAG-GFX10-LABEL: v_icmp_i32_sgt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_i32_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; 
SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_sgt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -442,13 +442,13 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; GISEL-GFX10-LABEL: v_icmp_i32_sgt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_i32_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 38) store i32 %result, ptr addrspace(1) %out @@ -459,10 +459,10 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_sge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 
s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -473,23 +473,23 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_sge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_i32_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_le_i32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_sge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -500,13 +500,13 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) 
{ ; GISEL-GFX10-LABEL: v_icmp_i32_sge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_i32_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_le_i32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 39) store i32 %result, ptr addrspace(1) %out @@ -517,10 +517,10 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_slt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -531,23 +531,23 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_slt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt 
lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_i32_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_slt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -558,13 +558,13 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_slt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_i32_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 40) store i32 %result, ptr addrspace(1) %out @@ -575,10 +575,10 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: 
v_icmp_i32_sle: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -589,23 +589,23 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_sle: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_i32_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_sle: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, 
s[0:1] @@ -616,13 +616,13 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_sle: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_i32_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 41) store i32 %result, ptr addrspace(1) %out @@ -632,7 +632,7 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_eq: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -644,7 +644,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_eq: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] @@ -654,7 +654,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_eq: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] @@ -667,7 +667,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_eq: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] @@ -682,7 +682,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_ne: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -694,7 +694,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_ne: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] @@ -704,7 +704,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_ne: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] @@ -717,7 +717,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 
%src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_ne: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] @@ -732,7 +732,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_u64_ugt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -744,7 +744,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_u64_ugt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] @@ -754,7 +754,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_u64_ugt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] @@ -767,7 +767,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_ugt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_lt_u64_e64 
s2, 0x64, s[2:3] @@ -782,7 +782,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_u64_uge: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -794,7 +794,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_u64_uge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] @@ -804,7 +804,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_u64_uge: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] @@ -817,7 +817,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_uge: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] @@ -832,7 +832,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_u64_ult: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 
+; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -844,7 +844,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_u64_ult: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] @@ -854,7 +854,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_u64_ult: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] @@ -867,7 +867,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_ult: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] @@ -882,7 +882,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_u64_ule: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -894,7 +894,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; 
SDAG-GFX10-LABEL: v_icmp_u64_ule: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] @@ -904,7 +904,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_u64_ule: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] @@ -917,7 +917,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_ule: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] @@ -932,7 +932,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_sgt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -944,7 +944,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_sgt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] 
@@ -954,7 +954,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_sgt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] @@ -967,7 +967,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_sgt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] @@ -982,7 +982,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_sge: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -994,7 +994,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_sge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] @@ -1004,7 +1004,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_sge: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: 
v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] @@ -1017,7 +1017,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_sge: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] @@ -1032,7 +1032,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_slt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1044,7 +1044,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_slt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] @@ -1054,7 +1054,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_slt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] @@ -1067,7 +1067,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_slt: ; GISEL-GFX10: ; %bb.0: -; 
GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] @@ -1082,7 +1082,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_sle: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1094,7 +1094,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_sle: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] @@ -1104,7 +1104,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_sle: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] @@ -1117,7 +1117,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_sle: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] @@ -1133,10 +1133,10 @@ define amdgpu_kernel 
void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_eq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1147,23 +1147,23 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_eq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_u16_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_eq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1174,13 +1174,13 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_eq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_u16_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 32) store i32 %result, ptr addrspace(1) %out @@ -1198,7 +1198,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i16: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -1208,7 +1208,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i16: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1] @@ -1222,10 +1222,10 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_ne: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 
0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1236,23 +1236,23 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_ne: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ne_u16_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_ne: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1263,13 +1263,13 @@ define amdgpu_kernel void 
@v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_ne: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ne_u16_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 33) store i32 %result, ptr addrspace(1) %out @@ -1280,10 +1280,10 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_ugt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1294,23 +1294,23 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_ugt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: 
v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_u16_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_ugt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1321,13 +1321,13 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_ugt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_u16_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 34) store i32 %result, ptr addrspace(1) %out @@ -1338,10 +1338,10 @@ define amdgpu_kernel void 
@v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_uge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1352,23 +1352,23 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_uge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_u16_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_le_u16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_uge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1379,13 +1379,13 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_uge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_u16_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_le_u16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 35) store i32 %result, ptr addrspace(1) %out @@ -1396,10 +1396,10 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_ult: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1410,23 +1410,23 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_ult: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_u16_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_ult: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1437,13 +1437,13 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_ult: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_u16_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 
@llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 36) store i32 %result, ptr addrspace(1) %out @@ -1454,10 +1454,10 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_ule: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1468,23 +1468,23 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_ule: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_u16_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_ule: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: 
v_cmp_ge_u16_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1495,13 +1495,13 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_ule: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_u16_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 37) store i32 %result, ptr addrspace(1) %out @@ -1512,10 +1512,10 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; SDAG-GFX11-LABEL: v_icmp_i16_sgt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1526,23 +1526,23 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; SDAG-GFX10-LABEL: v_icmp_i16_sgt: ; 
SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_i16_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_sgt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1553,13 +1553,13 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; GISEL-GFX10-LABEL: v_icmp_i16_sgt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_i16_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s4 +; 
GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 38) store i32 %result, ptr addrspace(1) %out @@ -1570,10 +1570,10 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_sge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1584,23 +1584,23 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_sge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_i16_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_le_i16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_sge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1611,13 +1611,13 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_sge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_i16_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_le_i16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 39) store i32 %result, ptr addrspace(1) %out @@ -1628,10 +1628,10 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_slt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1642,23 
+1642,23 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_slt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_i16_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_slt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1669,13 +1669,13 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_slt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_i16_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 
s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 40) store i32 %result, ptr addrspace(1) %out @@ -1686,10 +1686,10 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_sle: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s2 +; SDAG-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s4 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1700,23 +1700,23 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_sle: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_i16_e64 s0, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX10-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_sle: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; 
GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s2 +; GISEL-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s4 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1727,13 +1727,13 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_sle: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_i16_e64 s0, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX10-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 41) store i32 %result, ptr addrspace(1) %out @@ -1743,7 +1743,7 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX11-LABEL: v_icmp_i1_ne0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_gt_u32 s2, 1 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 @@ -1759,7 +1759,7 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; GFX10-LABEL: v_icmp_i1_ne0: ; GFX10: ; %bb.0: -; 
GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_gt_u32 s2, 1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll index 5f979e0177f58..60e242bf5b0e8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll @@ -25,11 +25,11 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_eq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -40,11 +40,11 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_eq: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -54,24 +54,24 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_eq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_eq: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -98,7 +98,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i32: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -108,7 +108,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-VI-LABEL: v_icmp_i32: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -117,7 +117,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-GFX9-LABEL: 
v_icmp_i32: ; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] @@ -131,11 +131,11 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_ne: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_ne_u32_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -146,11 +146,11 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_ne: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -160,24 +160,24 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_ne: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 
; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_ne: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -193,11 +193,11 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_ugt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_u32_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_lt_u32_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -208,11 +208,11 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_ugt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: 
v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -222,24 +222,24 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_u32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_gt_u32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_ugt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -255,11 +255,11 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_uge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_u32_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_le_u32_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -270,11 +270,11 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_uge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -284,24 +284,24 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_uge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -317,11 +317,11 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_ult: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_u32_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_gt_u32_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -332,11 +332,11 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_ult: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -346,24 +346,24 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; 
GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_ult: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -379,11 +379,11 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_ule: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_u32_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_ge_u32_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -394,11 +394,11 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_ule: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -408,24 +408,24 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_ule: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -441,11 +441,11 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; GFX11-LABEL: v_icmp_i32_sgt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_i32_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_lt_i32_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -456,11 +456,11 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; ; SDAG-VI-LABEL: v_icmp_i32_sgt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -470,24 +470,24 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; ; GFX9-LABEL: v_icmp_i32_sgt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_i32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_gt_i32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_sgt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; 
GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -503,11 +503,11 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_sge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_i32_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_le_i32_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -518,11 +518,11 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_sge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -532,24 +532,24 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_sge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; 
GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_i32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_ge_i32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_sge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -565,11 +565,11 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_slt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_i32_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_gt_i32_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -580,11 +580,11 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_slt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; 
SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -594,24 +594,24 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_slt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_i32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_slt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -627,11 +627,11 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_sle: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; 
GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_i32_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_ge_i32_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -642,11 +642,11 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_sle: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -656,24 +656,24 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_sle: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_i32_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_le_i32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_sle: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, 
s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -688,7 +688,7 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_eq: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u64_e64 s[2:3], 0x64, s[2:3] @@ -702,7 +702,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_eq: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -716,7 +716,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_eq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -729,7 +729,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_eq: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -748,7 +748,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { define 
amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_ne: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u64_e64 s[2:3], 0x64, s[2:3] @@ -762,7 +762,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_ne: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -776,7 +776,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_ne: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -789,7 +789,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_ne: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -808,7 +808,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_u64_ugt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_u64_e64 s[2:3], 0x64, s[2:3] @@ -822,7 +822,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_u64_ugt: ; 
SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -836,7 +836,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -849,7 +849,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_u64_ugt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -868,7 +868,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_u64_uge: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_u64_e64 s[2:3], 0x64, s[2:3] @@ -882,7 +882,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_u64_uge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -896,7 +896,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -909,7 +909,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_u64_uge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -928,7 +928,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_u64_ult: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_u64_e64 s[2:3], 0x64, s[2:3] @@ -942,7 +942,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_u64_ult: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -956,7 +956,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -969,7 +969,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_u64_ult: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; 
GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -988,7 +988,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_u64_ule: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ge_u64_e64 s[2:3], 0x64, s[2:3] @@ -1002,7 +1002,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_u64_ule: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1016,7 +1016,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1029,7 +1029,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_u64_ule: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1048,7 +1048,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_sgt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_i64_e64 
s[2:3], 0x64, s[2:3] @@ -1062,7 +1062,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_sgt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1076,7 +1076,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_sgt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1089,7 +1089,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_sgt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1108,7 +1108,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_sge: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_i64_e64 s[2:3], 0x64, s[2:3] @@ -1122,7 +1122,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_sge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1136,7 +1136,7 @@ define amdgpu_kernel void 
@v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_sge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1149,7 +1149,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_sge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1168,7 +1168,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_slt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_i64_e64 s[2:3], 0x64, s[2:3] @@ -1182,7 +1182,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_slt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1196,7 +1196,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_slt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1209,7 +1209,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_slt: ; GISEL-VI: ; 
%bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1228,7 +1228,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_sle: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ge_i64_e64 s[2:3], 0x64, s[2:3] @@ -1242,7 +1242,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_sle: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1256,7 +1256,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_sle: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1269,7 +1269,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_sle: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1289,11 +1289,11 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_eq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; 
GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u16_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_eq_u16_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1304,11 +1304,11 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_eq: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1318,24 +1318,24 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_eq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_eq_u16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_eq: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c 
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1362,7 +1362,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i16: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1372,7 +1372,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-VI-LABEL: v_icmp_i16: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1381,7 +1381,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-GFX9-LABEL: v_icmp_i16: ; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] @@ -1395,11 +1395,11 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_ne: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 
v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u16_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_ne_u16_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1410,11 +1410,11 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_ne: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1424,24 +1424,24 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_ne: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_ne_u16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_ne: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; 
GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1457,11 +1457,11 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_ugt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_u16_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_lt_u16_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1472,11 +1472,11 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_ugt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1486,24 +1486,24 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_u16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_gt_u16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_ugt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1519,11 +1519,11 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_uge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_u16_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_le_u16_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1534,11 +1534,11 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_uge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1548,24 +1548,24 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_u16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_ge_u16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_uge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1581,11 +1581,11 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_ult: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 
0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_u16_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_gt_u16_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1596,11 +1596,11 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_ult: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1610,24 +1610,24 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_u16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_lt_u16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_ult: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1643,11 +1643,11 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_ule: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_u16_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_ge_u16_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1658,11 +1658,11 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_ule: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1672,24 +1672,24 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: 
v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_u16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_le_u16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_ule: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1705,11 +1705,11 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; GFX11-LABEL: v_icmp_i16_sgt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_i16_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_lt_i16_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1720,11 +1720,11 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; ; SDAG-VI-LABEL: v_icmp_i16_sgt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; 
SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1734,24 +1734,24 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; ; GFX9-LABEL: v_icmp_i16_sgt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_i16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_gt_i16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_sgt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1767,11 +1767,11 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_sge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 
0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_i16_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_le_i16_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1782,11 +1782,11 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_sge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1796,24 +1796,24 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_sge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_i16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_ge_i16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_sge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: 
s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1829,11 +1829,11 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_slt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_i16_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_gt_i16_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1844,11 +1844,11 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_slt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1858,24 +1858,24 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_slt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_i16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_lt_i16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_slt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1891,11 +1891,11 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_sle: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_i16_e64 s[2:3], 0x64, s2 +; GFX11-NEXT: v_cmp_ge_i16_e64 s[2:3], 0x64, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1906,11 +1906,11 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_sle: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; 
SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s2, v0 +; SDAG-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s4, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1920,24 +1920,24 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_sle: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_i16_e64 s[0:1], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cmp_le_i16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_sle: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s2, v0 +; GISEL-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s4, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1952,7 +1952,7 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX11-LABEL: v_icmp_i1_ne0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 
s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_gt_u32 s2, 1 @@ -1970,7 +1970,7 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; VI-LABEL: v_icmp_i1_ne0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_gt_u32 s2, 1 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 @@ -1986,7 +1986,7 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; GFX9-LABEL: v_icmp_i1_ne0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cmp_gt_u32 s2, 1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index dba67a03c000e..3168e05b816be 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -14,8 +14,9 @@ entry: define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_iglp_opt_mfma_gemm: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: ; iglp_opt mask(0x00000000) ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -151,11 +152,11 @@ entry: define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_iglp_opt_rev_mfma_gemm: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-NEXT: 
v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 -; GCN-NEXT: ; iglp_opt mask(0x00000001) ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v1, s0, v0 ; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 @@ -177,6 +178,7 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:8208 ; GCN-NEXT: ds_read_b128 a[128:131], v1 offset:8192 ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 +; GCN-NEXT: ; iglp_opt mask(0x00000001) ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159] ; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll index 70eff49450153..f7f72ae31cc1d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll @@ -310,10 +310,10 @@ define amdgpu_kernel void @kernel_implicitarg_no_struct_align_padding(<16 x i32> declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #2 declare ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #2 -attributes #0 = { nounwind noinline } -attributes #1 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="48" } +attributes #0 = { nounwind noinline "amdgpu-no-dispatch-ptr" "amdgpu-no-queue-ptr" } +attributes #1 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="48" "amdgpu-no-dispatch-ptr" "amdgpu-no-queue-ptr" } attributes #2 = { nounwind readnone speculatable } -attributes #3 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="0" } +attributes #3 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="0" "amdgpu-no-dispatch-ptr" "amdgpu-no-queue-ptr" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 CODE_OBJECT_VERSION} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll 
index f1a4fe0f090b1..2d01703c78d78 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -163,7 +163,7 @@ main_body: define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v6, 4.0 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000 @@ -189,7 +189,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v10, 0x41000000 ; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40e00000 @@ -215,11 +215,13 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; ; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_lshlrev_b32 v2, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v6, 0 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v7, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0 ; GFX11-NEXT: v_mov_b32_e32 v8, 2.0 -; GFX11-NEXT: v_dual_mov_b32 v4, 4.0 :: v_dual_mov_b32 v7, 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, 4.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -228,8 +230,8 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 ; GFX11-NEXT: flat_load_b32 v9, v[0:1] ; GFX11-NEXT: flat_load_b32 v10, v[2:3] -; GFX11-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x40e00000 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000 ; GFX11-NEXT: v_mov_b32_e32 v3, 0x40400000 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -260,7 +262,7 @@ main_body: define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v6, 0x46004500 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700 @@ -283,7 +285,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; ; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200 @@ -306,21 +308,22 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; ; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v5, 2.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: 
v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 ; GFX11-NEXT: v_add_co_u32 v2, s0, s2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 ; GFX11-NEXT: flat_load_b32 v6, v[0:1] ; GFX11-NEXT: flat_load_b32 v7, v[2:3] ; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400 -; GFX11-NEXT: v_dual_mov_b32 v0, 0x46004200 :: v_dual_mov_b32 v3, 0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0x48004500 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200 +; GFX11-NEXT: v_dual_mov_b32 v2, 0x48004500 :: v_dual_mov_b32 v3, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[3:5], v[0:2]], s[4:7] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -349,9 +352,9 @@ main_body: define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0 @@ -375,9 +378,9 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; ; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX1030-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x34 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mov_b32_e32 v11, 0x41000000 ; GFX1030-NEXT: v_mov_b32_e32 v10, 0x40e00000 @@ -401,15 +404,16 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; ; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v7, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000 ; GFX11-NEXT: v_dual_mov_b32 v3, 0x40400000 :: v_dual_mov_b32 v4, 4.0 -; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: v_dual_mov_b32 v8, 2.0 :: v_dual_mov_b32 v9, 0xb36211c7 ; GFX11-NEXT: v_bfrev_b32_e32 v10, 4.0 -; GFX11-NEXT: v_mov_b32_e32 v7, 1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -443,9 +447,9 @@ main_body: define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0 @@ -466,9 +470,9 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; ; GFX1030-LABEL: 
image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 @@ -489,21 +493,23 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; ; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 -; GFX11-NEXT: v_dual_mov_b32 v2, 0x48004500 :: v_dual_mov_b32 v5, 2.0 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x48004500 ; GFX11-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: v_mov_b32_e32 v6, 0xb36211c6 ; GFX11-NEXT: v_bfrev_b32_e32 v7, 4.0 -; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: v_mov_b32_e32 v5, 2.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4 ; GFX11-NEXT: flat_load_b32 v8, v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[3:5], v[0:2]], s[0:3] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll index bc10eb68d75cb..0076079ce17c7 
100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll @@ -4,7 +4,7 @@ ; GCN-LABEL: {{^}}is_private_vgpr: ; GCN-DAG: {{flat|global|buffer}}_load_dwordx2 v{{\[[0-9]+}}:[[PTR_HI:[0-9]+]]] -; CI-DAG: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-DAG: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CIT: v_cmp_eq_u32_e32 vcc, s4, v[[PTR_HI]] ; CIH: v_cmp_eq_u32_e32 vcc, s2, v[[PTR_HI]] @@ -26,10 +26,10 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; select and vcc branch. ; GCN-LABEL: {{^}}is_private_sgpr: -; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x1{{$}} +; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x1{{$}} -; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x32{{$}} -; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x4{{$}} +; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x32{{$}} +; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x4{{$}} ; CI: s_cmp_eq_u32 [[APERTURE]], [[PTR_HI]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll index aad4d924952ff..e24c47991fe3d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll @@ -4,7 +4,7 @@ ; GCN-LABEL: {{^}}is_local_vgpr: ; GCN-DAG: {{flat|global|buffer}}_load_dwordx2 v{{\[[0-9]+}}:[[PTR_HI:[0-9]+]]] -; CI-DAG: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-DAG: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_shared_base ; GFX9: v_cmp_eq_u32_e32 vcc, s[[HI]], v[[PTR_HI]] @@ -26,10 +26,10 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; select and vcc branch. 
; GCN-LABEL: {{^}}is_local_sgpr: -; CI-DAG: s_load_dword s0, s[4:5], 0x1 +; CI-DAG: s_load_dword s0, s[6:7], 0x1 -; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x33{{$}} -; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x4{{$}} +; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x33{{$}} +; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x4{{$}} ; GFX9: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_shared_base ; GFX9: s_cmp_eq_u32 [[PTR_HI]], s[[HI]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll index 8dba22312ac88..ee005eb6e9841 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll @@ -4,9 +4,9 @@ ; ALL-LABEL: {{^}}test: ; OS-MESA3D: enable_sgpr_kernarg_segment_ptr = 1 -; CO-V4: s_load_dword s{{[0-9]+}}, s[4:5], 0xa +; CO-V4: s_load_dword s{{[0-9]+}}, s[8:9], 0xa -; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0xa +; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[4:5], 0xa ; HSA: .amdhsa_kernarg_size 8 ; HSA: .amdhsa_user_sgpr_kernarg_segment_ptr 1 @@ -23,7 +23,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out) #1 { ; OS-MESA3D: kernarg_segment_alignment = 4 ; 10 + 9 (36 prepended implicit bytes) + 2(out pointer) = 21 = 0x15 -; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0x15 +; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[4:5], 0x15 ; HSA: .amdhsa_kernarg_size 8 define amdgpu_kernel void @test_implicit(ptr addrspace(1) %out) #1 { %implicitarg.ptr = call noalias ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() @@ -78,7 +78,7 @@ define amdgpu_kernel void @opencl_test_implicit_alignment(ptr addrspace(1) %out, ; HSA: .amdhsa_kernarg_size 0 ; HSA: .amdhsa_user_sgpr_kernarg_segment_ptr 0 -define amdgpu_kernel void @test_no_kernargs() #1 { +define amdgpu_kernel void @test_no_kernargs() #4 { %kernarg.segment.ptr = call noalias ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() %gep = 
getelementptr i32, ptr addrspace(4) %kernarg.segment.ptr, i64 10 %value = load i32, ptr addrspace(4) %gep @@ -123,6 +123,7 @@ attributes #0 = { nounwind readnone } attributes #1 = { nounwind "amdgpu-implicitarg-num-bytes"="0" } attributes #2 = { nounwind "amdgpu-implicitarg-num-bytes"="48" } attributes #3 = { nounwind "amdgpu-implicitarg-num-bytes"="38" } +attributes #4 = { nounwind "amdgpu-implicitarg-num-bytes"="0" "amdgpu-no-implicitarg-ptr" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll index 61818dafd2b84..c201f84cac726 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll @@ -23,8 +23,8 @@ define void @function_lds_id(ptr addrspace(1) %out) { define amdgpu_kernel void @kernel_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 { ; GCN-LABEL: kernel_lds_id: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GCN-NEXT: s_add_i32 s2, s6, 42 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-NEXT: s_add_i32 s2, s10, 42 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -42,21 +42,27 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l ; GCN-LABEL: indirect_lds_id: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 -; GCN-NEXT: s_add_i32 s6, s6, s9 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_add_u32 s0, s0, s15 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] +; GCN-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GCN-NEXT: s_add_u32 s8, s6, 8 +; 
GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: s_addc_u32 s9, s7, 0 ; GCN-NEXT: s_getpc_b64 s[6:7] ; GCN-NEXT: s_add_u32 s6, s6, function_lds_id@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s7, s7, function_lds_id@gotpcrel32@hi+12 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v31, v0, v2 ; GCN-NEXT: s_mov_b32 s15, 21 -; GCN-NEXT: s_mov_b32 s12, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: s_endpgm call void @function_lds_id(ptr addrspace(1) %out) @@ -66,7 +72,7 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l define amdgpu_kernel void @doesnt_use_it(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 { ; GCN-LABEL: doesnt_use_it: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GCN-NEXT: v_mov_b32_e32 v2, 0x64 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index 1ae22c3eec185..8e9a652ae8a8e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -15,20 +15,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i32(ptr addrspace(1) %out, i32 % ; GFX10-LABEL: v_permlane16_b32_vss_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 +; GFX10-NEXT: 
v_permlane16_b32 v0, v0, s7, s0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -41,8 +41,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i32(ptr addrspace(1) %out, i32 % ; GFX12-LABEL: v_permlane16_b32_vss_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -60,20 +60,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f32(ptr addrspace(1) %out, float ; GFX10-LABEL: v_permlane16_b32_vss_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 
v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -86,8 +86,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f32(ptr addrspace(1) %out, float ; GFX12-LABEL: v_permlane16_b32_vss_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -105,36 +105,36 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i64(ptr addrspace(1) %out, i64 % ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 
v0, v0, s2, s3 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -149,8 +149,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i64(ptr addrspace(1) %out, i64 % ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -165,8 +165,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i64(ptr addrspace(1) %out, i64 % ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -181,8 +181,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i64(ptr addrspace(1) %out, i64 % ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_i64: ; 
GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -202,36 +202,36 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f64(ptr addrspace(1) %out, doubl ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, 
s0, s1 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -246,8 +246,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f64(ptr addrspace(1) %out, doubl ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -262,8 +262,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f64(ptr addrspace(1) %out, doubl ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -278,8 +278,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f64(ptr addrspace(1) %out, doubl ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; 
GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -299,22 +299,22 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i32(ptr addrspace(1) %out, i32 % ; GFX10-LABEL: v_permlane16_b32_vii_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vii_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -324,7 +324,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-LABEL: v_permlane16_b32_vii_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -342,22 +342,22 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f32(ptr addrspace(1) %out, float ; GFX10-LABEL: v_permlane16_b32_vii_f32: ; 
GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vii_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -367,7 +367,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f32(ptr addrspace(1) %out, float ; ; GFX12-LABEL: v_permlane16_b32_vii_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -384,7 +384,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f32(ptr addrspace(1) %out, float define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 %src0) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vii_i64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -396,7 +396,7 @@ define amdgpu_kernel void 
@v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vii_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -408,7 +408,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vii_i64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -422,7 +422,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vii_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -436,7 +436,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vii_i64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -450,7 +450,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vii_i64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -469,7 +469,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, double %src0) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vii_f64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -481,7 +481,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vii_f64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -493,7 +493,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vii_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -507,7 +507,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vii_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -521,7 +521,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vii_f64: ; GFX12-SDAG: ; %bb.0: -; 
GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -535,7 +535,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vii_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -556,25 +556,25 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 % ; GFX10-LABEL: v_permlane16_b32_vll_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_movk_i32 s0, 0x1234 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_movk_i32 s2, 0x1234 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, 0xc1d1 -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vll_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_movk_i32 s2, 0x1234 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: 
v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -583,7 +583,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-LABEL: v_permlane16_b32_vll_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 @@ -601,7 +601,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 % define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 %src0) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vll_i64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -614,7 +614,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vll_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -627,7 +627,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vll_i64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -643,7 +643,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; ; 
GFX11-GISEL-LABEL: v_permlane16_b32_vll_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -659,7 +659,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vll_i64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -675,7 +675,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vll_i64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -697,25 +697,25 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float ; GFX10-LABEL: v_permlane16_b32_vll_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_movk_i32 s0, 0x1234 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_movk_i32 s2, 0x1234 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, 0xc1d1 -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vll_f32: ; 
GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_movk_i32 s2, 0x1234 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -724,7 +724,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float ; ; GFX12-LABEL: v_permlane16_b32_vll_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 @@ -742,7 +742,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, double %src0) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vll_f64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -755,7 +755,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vll_f64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -768,7 +768,7 @@ define 
amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vll_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -784,7 +784,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vll_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -800,7 +800,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vll_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -816,7 +816,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vll_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -838,33 +838,33 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; GFX10-LABEL: v_permlane16_b32_vvv_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[2:3], 
0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-NEXT: v_readfirstlane_b32 s3, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vvv_i32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -873,17 +873,17 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; 
GFX11-GISEL-LABEL: v_permlane16_b32_vvv_i32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -891,7 +891,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vvv_i32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -909,7 +909,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vvv_i32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | 
instid1(VALU_DEP_2) @@ -933,7 +933,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % define amdgpu_kernel void @v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 %src0) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vvv_i64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -947,7 +947,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 % ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vvv_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 @@ -961,7 +961,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 % ; ; GFX11-LABEL: v_permlane16_b32_vvv_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 @@ -981,7 +981,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 % ; ; GFX12-LABEL: v_permlane16_b32_vvv_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -1009,33 +1009,33 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; GFX10-LABEL: v_permlane16_b32_vvv_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: 
s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-NEXT: v_readfirstlane_b32 s3, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vvv_f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1044,17 +1044,17 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr 
addrspace(1) %out, float ; GFX11-GISEL-LABEL: v_permlane16_b32_vvv_f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1062,7 +1062,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vvv_f32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -1080,7 +1080,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vvv_f32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) @@ -1104,7 +1104,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, double %src0) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vvv_f64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -1118,7 +1118,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, doubl ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vvv_f64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 @@ -1132,7 +1132,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, doubl ; ; GFX11-LABEL: v_permlane16_b32_vvv_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 @@ -1152,7 +1152,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, doubl ; ; GFX12-LABEL: v_permlane16_b32_vvv_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -1179,7 +1179,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, doubl define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 %src0, i32 %src2) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_i32: ; 
GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 @@ -1190,7 +1190,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1201,12 +1201,12 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_i32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 @@ -1215,11 +1215,12 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 @@ -1228,12 +1229,12 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_i32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 @@ -1242,11 +1243,12 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_i32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 @@ -1262,102 +1264,70 @@ define amdgpu_kernel void 
@v_permlane16_b32_vvs_i64(ptr addrspace(1) %out, i64 % ; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX10-SDAG-NEXT: s_mov_b32 null, 0 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s2 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX10-GISEL-NEXT: s_mov_b32 null, 0 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s2 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s2 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_i64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; 
GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_i64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_i64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; 
GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm +; GFX11-LABEL: v_permlane16_b32_vvs_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_i64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm +; GFX12-LABEL: v_permlane16_b32_vvs_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 
+; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out @@ -1367,7 +1337,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i64(ptr addrspace(1) %out, i64 % define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float %src0, i32 %src2) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_f32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 @@ -1378,7 +1348,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_f32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1389,12 +1359,12 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 @@ -1403,11 +1373,12 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 @@ -1416,12 +1387,12 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_f32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: 
v_permlane16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 @@ -1430,11 +1401,12 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_f32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 @@ -1450,102 +1422,70 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f64(ptr addrspace(1) %out, doubl ; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX10-SDAG-NEXT: s_mov_b32 null, 0 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s2 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_f64: ; 
GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX10-GISEL-NEXT: s_mov_b32 null, 0 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s2 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s2 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_f64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm +; GFX11-LABEL: v_permlane16_b32_vvs_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ; -; 
GFX12-GISEL-LABEL: v_permlane16_b32_vvs_f64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm +; GFX12-LABEL: v_permlane16_b32_vvs_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store double %v, ptr addrspace(1) %out @@ -1555,7 +1495,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f64(ptr addrspace(1) %out, doubl define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_i32: ; 
GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 @@ -1566,7 +1506,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1577,7 +1517,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_i32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -1592,7 +1532,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1607,7 +1547,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_i32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -1622,7 +1562,7 @@ define amdgpu_kernel void 
@v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_i32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1644,40 +1584,38 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 % ; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX10-SDAG-NEXT: s_mov_b32 null, 0 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s0 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX10-GISEL-NEXT: s_mov_b32 null, 0 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; 
GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s0 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s0 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1695,8 +1633,8 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 % ; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1714,8 +1652,8 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 % ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ 
-1733,8 +1671,8 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 % ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1757,7 +1695,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 % define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float %src0, i32 %src1) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_f32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 @@ -1768,7 +1706,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_f32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1779,7 +1717,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -1794,7 +1732,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr 
addrspace(1) %out, float ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1809,7 +1747,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_f32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -1824,7 +1762,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_f32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1846,40 +1784,38 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f64(ptr addrspace(1) %out, doubl ; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX10-SDAG-NEXT: s_mov_b32 null, 0 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, 
s2, s0 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX10-GISEL-NEXT: s_mov_b32 null, 0 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s0 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s0 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1897,8 +1833,8 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f64(ptr addrspace(1) %out, doubl ; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 
s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1916,8 +1852,8 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f64(ptr addrspace(1) %out, doubl ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1935,8 +1871,8 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f64(ptr addrspace(1) %out, doubl ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1960,20 +1896,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i32(ptr addrspace(1) %out, i3 ; GFX10-LABEL: v_permlane16_b32_vss_fi_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[1,0] +; GFX10-NEXT: 
v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_fi_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1986,8 +1922,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i32(ptr addrspace(1) %out, i3 ; GFX12-LABEL: v_permlane16_b32_vss_fi_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2005,36 +1941,36 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i64(ptr addrspace(1) %out, i6 ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; 
GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2049,8 +1985,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i64(ptr addrspace(1) %out, i6 ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2065,8 +2001,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i64(ptr addrspace(1) %out, i6 ; 
GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2081,8 +2017,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i64(ptr addrspace(1) %out, i6 ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2102,20 +2038,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f32(ptr addrspace(1) %out, fl ; GFX10-LABEL: v_permlane16_b32_vss_fi_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[1,0] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_fi_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2128,8 +2064,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f32(ptr addrspace(1) %out, fl ; GFX12-LABEL: v_permlane16_b32_vss_fi_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2147,36 +2083,36 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f64(ptr addrspace(1) %out, do ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2191,8 +2127,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f64(ptr addrspace(1) %out, do ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2207,8 +2143,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f64(ptr addrspace(1) %out, do ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; 
GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2223,8 +2159,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f64(ptr addrspace(1) %out, do ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2244,20 +2180,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i32(ptr addrspace(1) %out, i3 ; GFX10-LABEL: v_permlane16_b32_vss_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[0,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2270,8 +2206,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i32(ptr addrspace(1) %out, i3 ; GFX12-LABEL: v_permlane16_b32_vss_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: 
s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2289,36 +2225,36 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i64(ptr addrspace(1) %out, i6 ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: 
s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2333,8 +2269,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i64(ptr addrspace(1) %out, i6 ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2349,8 +2285,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i64(ptr addrspace(1) %out, i6 ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2365,8 +2301,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i64(ptr addrspace(1) %out, i6 ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2386,20 +2322,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f32(ptr addrspace(1) %out, fl ; GFX10-LABEL: v_permlane16_b32_vss_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[0,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2412,8 +2348,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f32(ptr addrspace(1) %out, fl ; GFX12-LABEL: v_permlane16_b32_vss_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2431,36 +2367,36 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f64(ptr addrspace(1) %out, do ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; 
GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 
s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2475,8 +2411,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f64(ptr addrspace(1) %out, do ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2491,8 +2427,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f64(ptr addrspace(1) %out, do ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2507,8 +2443,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f64(ptr addrspace(1) %out, do ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2528,20 +2464,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_vss_fi_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; 
GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[1,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_fi_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2554,8 +2490,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_vss_fi_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2573,36 +2509,36 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, 
v1, s2, s3 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2617,8 +2553,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2633,8 +2569,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2649,8 +2585,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2670,20 +2606,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_vss_fi_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[1,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; 
GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_fi_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2696,8 +2632,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_vss_fi_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2715,36 +2651,36 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: 
s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2759,8 +2695,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2775,8 +2711,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: 
s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2791,8 +2727,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2812,20 +2748,20 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i32(ptr addrspace(1) %out, i32 ; GFX10-LABEL: v_permlanex16_b32_vss_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) @@ -2838,8 +2774,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i32(ptr addrspace(1) %out, i32 ; GFX12-LABEL: v_permlanex16_b32_vss_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2857,20 +2793,20 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f32(ptr addrspace(1) %out, floa ; GFX10-LABEL: v_permlanex16_b32_vss_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2883,8 +2819,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f32(ptr addrspace(1) %out, floa ; GFX12-LABEL: v_permlanex16_b32_vss_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2902,36 +2838,36 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i64(ptr addrspace(1) %out, i64 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 
0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2946,8 +2882,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i64(ptr addrspace(1) %out, i64 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2962,8 +2898,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i64(ptr addrspace(1) %out, i64 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2978,8 +2914,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i64(ptr addrspace(1) %out, i64 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2999,36 +2935,36 @@ define amdgpu_kernel void 
@v_permlanex16_b32_vss_f64(ptr addrspace(1) %out, doub ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -3043,8 +2979,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f64(ptr addrspace(1) %out, doub ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -3059,8 +2995,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f64(ptr addrspace(1) %out, doub ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -3075,8 +3011,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f64(ptr addrspace(1) %out, doub ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -3096,22 +3032,22 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i32(ptr addrspace(1) %out, i32 ; GFX10-LABEL: v_permlanex16_b32_vii_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; 
GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vii_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -3121,7 +3057,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i32(ptr addrspace(1) %out, i32 ; ; GFX12-LABEL: v_permlanex16_b32_vii_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3139,22 +3075,22 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f32(ptr addrspace(1) %out, floa ; GFX10-LABEL: v_permlanex16_b32_vii_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] 
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vii_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -3164,7 +3100,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f32(ptr addrspace(1) %out, floa ; ; GFX12-LABEL: v_permlanex16_b32_vii_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3181,7 +3117,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f32(ptr addrspace(1) %out, floa define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 %src0) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vii_i64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3193,7 +3129,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vii_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3205,7 +3141,7 @@ define amdgpu_kernel void 
@v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vii_i64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3219,7 +3155,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vii_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3233,7 +3169,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vii_i64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3247,7 +3183,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vii_i64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3266,7 +3202,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, double %src0) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vii_f64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3278,7 +3214,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vii_f64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3290,7 +3226,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vii_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3304,7 +3240,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vii_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3318,7 +3254,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vii_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3332,7 +3268,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub ; ; GFX12-GISEL-LABEL: 
v_permlanex16_b32_vii_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3353,25 +3289,25 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i32(ptr addrspace(1) %out, i32 ; GFX10-LABEL: v_permlanex16_b32_vll_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_movk_i32 s0, 0x1234 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_movk_i32 s2, 0x1234 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, 0xc1d1 -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vll_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_movk_i32 s2, 0x1234 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -3380,7 +3316,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i32(ptr addrspace(1) %out, i32 ; ; GFX12-LABEL: v_permlanex16_b32_vll_i32: ; GFX12: ; %bb.0: -; 
GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 @@ -3399,25 +3335,25 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, floa ; GFX10-LABEL: v_permlanex16_b32_vll_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_movk_i32 s0, 0x1234 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_movk_i32 s2, 0x1234 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, 0xc1d1 -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vll_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_movk_i32 s2, 0x1234 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -3426,7 +3362,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, floa ; ; GFX12-LABEL: v_permlanex16_b32_vll_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 
; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 @@ -3444,7 +3380,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, floa define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 %src0) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vll_i64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3457,7 +3393,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vll_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3470,7 +3406,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vll_i64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3486,7 +3422,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vll_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3502,7 +3438,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; ; 
GFX12-SDAG-LABEL: v_permlanex16_b32_vll_i64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3518,7 +3454,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vll_i64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3539,7 +3475,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, double %src0) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vll_f64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3552,7 +3488,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vll_f64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3565,7 +3501,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vll_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: 
v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3581,7 +3517,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vll_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3597,7 +3533,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vll_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3613,7 +3549,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vll_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3635,33 +3571,33 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; GFX10-LABEL: v_permlanex16_b32_vvv_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-NEXT: v_readfirstlane_b32 s3, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvv_i32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3670,17 +3606,17 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv_i32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 
s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3688,7 +3624,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv_i32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -3706,7 +3642,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv_i32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) @@ -3731,33 +3667,33 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; GFX10-LABEL: v_permlanex16_b32_vvv_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-NEXT: v_readfirstlane_b32 s3, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvv_f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3766,17 +3702,17 @@ define 
amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv_f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3784,7 +3720,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv_f32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -3802,7 +3738,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv_f32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, 
v0, 10, 10 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) @@ -3826,7 +3762,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 %src0) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvv_i64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -3840,7 +3776,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvv_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 @@ -3854,7 +3790,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 ; ; GFX11-LABEL: v_permlanex16_b32_vvv_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 @@ -3874,7 +3810,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 ; ; GFX12-LABEL: v_permlanex16_b32_vvv_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -3901,7 +3837,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, double %src0) { ; GFX10-SDAG-LABEL: 
v_permlanex16_b32_vvv_f64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -3915,7 +3851,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, doub ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvv_f64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 @@ -3929,7 +3865,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, doub ; ; GFX11-LABEL: v_permlanex16_b32_vvv_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 @@ -3949,7 +3885,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, doub ; ; GFX12-LABEL: v_permlanex16_b32_vvv_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -3976,7 +3912,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, doub define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 %src0, i32 %src2) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_i32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 @@ -3987,7 +3923,7 
@@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3998,12 +3934,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_i32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 @@ -4012,11 +3948,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, 
s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 @@ -4025,12 +3962,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_i32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 @@ -4039,11 +3976,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_i32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 @@ -4058,7 +3996,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, float %src0, i32 %src2) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_f32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; 
GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 @@ -4069,7 +4007,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_f32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -4080,12 +4018,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 @@ -4094,11 +4032,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 
v0, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 @@ -4107,12 +4046,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_f32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 @@ -4121,11 +4060,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_f32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 @@ -4141,102 +4081,70 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i64(ptr addrspace(1) %out, i64 ; GFX10-SDAG-LABEL: 
v_permlanex16_b32_vvs_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX10-SDAG-NEXT: s_mov_b32 null, 0 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s2 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX10-GISEL-NEXT: s_mov_b32 null, 0 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s2 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s2 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_i64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-SDAG-NEXT: 
s_load_b32 s0, s[0:1], 0x34 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_i64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_permlanex16_b32_vvs_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, 
s1, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_i64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_i64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm +; GFX12-LABEL: v_permlanex16_b32_vvs_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 
:: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out @@ -4247,102 +4155,70 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f64(ptr addrspace(1) %out, doub ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX10-SDAG-NEXT: s_mov_b32 null, 0 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s2 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX10-GISEL-NEXT: s_mov_b32 null, 0 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; 
GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s2 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s2 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, 
s1, s0 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_f64: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm +; GFX11-LABEL: v_permlanex16_b32_vvs_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_f64: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 -; 
GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm +; GFX12-LABEL: v_permlanex16_b32_vvs_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store double %v, ptr addrspace(1) %out @@ -4352,7 +4228,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f64(ptr addrspace(1) %out, doub define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_i32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 
s2, v1 @@ -4363,7 +4239,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -4374,7 +4250,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_i32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -4389,7 +4265,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4404,7 +4280,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_i32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -4419,7 +4295,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_i32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: 
v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4440,7 +4316,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, float %src0, i32 %src1) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_f32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 @@ -4451,7 +4327,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_f32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -4462,7 +4338,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -4477,7 +4353,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4492,7 +4368,7 @@ define amdgpu_kernel 
void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_f32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -4507,7 +4383,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_f32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4529,40 +4405,38 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i64(ptr addrspace(1) %out, i64 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX10-SDAG-NEXT: s_mov_b32 null, 0 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s0 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; 
GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX10-GISEL-NEXT: s_mov_b32 null, 0 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s0 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s0 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4580,8 +4454,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i64(ptr addrspace(1) %out, i64 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4599,8 +4473,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i64(ptr addrspace(1) %out, i64 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_i64: ; GFX12-SDAG: ; 
%bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4618,8 +4492,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i64(ptr addrspace(1) %out, i64 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4643,40 +4517,38 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f64(ptr addrspace(1) %out, doub ; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX10-SDAG-NEXT: s_mov_b32 null, 0 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s0 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; 
GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX10-GISEL-NEXT: s_mov_b32 null, 0 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s0 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s0 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4694,8 +4566,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f64(ptr addrspace(1) %out, doub ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4713,8 +4585,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f64(ptr addrspace(1) %out, doub ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4732,8 +4604,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f64(ptr addrspace(1) %out, doub ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4757,20 +4629,20 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i32(ptr addrspace(1) %out, i ; GFX10-LABEL: v_permlanex16_b32_vss_fi_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,0] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_fi_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; 
GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4783,8 +4655,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i32(ptr addrspace(1) %out, i ; GFX12-LABEL: v_permlanex16_b32_vss_fi_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4802,20 +4674,20 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f32(ptr addrspace(1) %out, f ; GFX10-LABEL: v_permlanex16_b32_vss_fi_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,0] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_fi_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4828,8 +4700,8 @@ define amdgpu_kernel void 
@v_permlanex16_b32_vss_fi_f32(ptr addrspace(1) %out, f ; GFX12-LABEL: v_permlanex16_b32_vss_fi_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4847,36 +4719,36 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i64(ptr addrspace(1) %out, i ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX10-GISEL-NEXT: 
v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -4891,8 +4763,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i64(ptr addrspace(1) %out, i ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -4907,8 +4779,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i64(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -4923,8 +4795,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i64(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: 
v_permlanex16_b32_vss_fi_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -4944,36 +4816,36 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f64(ptr addrspace(1) %out, d ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, 
v1, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -4988,8 +4860,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f64(ptr addrspace(1) %out, d ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5004,8 +4876,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f64(ptr addrspace(1) %out, d ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -5020,8 +4892,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f64(ptr addrspace(1) %out, d ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64: ; 
GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5041,20 +4913,20 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i32(ptr addrspace(1) %out, i ; GFX10-LABEL: v_permlanex16_b32_vss_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[0,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5067,8 +4939,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i32(ptr addrspace(1) %out, i ; GFX12-LABEL: v_permlanex16_b32_vss_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) @@ -5086,20 +4958,20 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f32(ptr addrspace(1) %out, f ; GFX10-LABEL: v_permlanex16_b32_vss_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[0,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5112,8 +4984,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f32(ptr addrspace(1) %out, f ; GFX12-LABEL: v_permlanex16_b32_vss_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5131,36 +5003,36 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i64(ptr addrspace(1) %out, i ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -5175,8 +5047,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i64(ptr 
addrspace(1) %out, i ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5191,8 +5063,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i64(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -5207,8 +5079,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i64(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5228,36 +5100,36 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f64(ptr addrspace(1) %out, d ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; 
GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -5272,8 +5144,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f64(ptr addrspace(1) %out, d ; 
GFX11-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5288,8 +5160,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f64(ptr addrspace(1) %out, d ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -5304,8 +5176,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f64(ptr addrspace(1) %out, d ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5325,20 +5197,20 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i32(ptr addrspace(1) %out ; GFX10-LABEL: v_permlanex16_b32_vss_fi_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: 
v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_fi_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5351,8 +5223,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i32(ptr addrspace(1) %out ; GFX12-LABEL: v_permlanex16_b32_vss_fi_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5370,20 +5242,20 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f32(ptr addrspace(1) %out ; GFX10-LABEL: v_permlanex16_b32_vss_fi_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_fi_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 
-; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5396,8 +5268,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f32(ptr addrspace(1) %out ; GFX12-LABEL: v_permlanex16_b32_vss_fi_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5415,36 +5287,36 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i64(ptr addrspace(1) %out ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -5459,8 +5331,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i64(ptr addrspace(1) %out ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5475,8 +5347,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i64(ptr addrspace(1) %out ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 
0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -5491,8 +5363,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i64(ptr addrspace(1) %out ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5512,36 +5384,36 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f64(ptr addrspace(1) %out ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; 
GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -5556,8 +5428,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f64(ptr addrspace(1) %out ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5572,8 +5444,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f64(ptr addrspace(1) %out ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; 
GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -5588,8 +5460,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f64(ptr addrspace(1) %out ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5609,23 +5481,24 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i32(ptr addrspace(1) %out, i ; GFX10-LABEL: v_permlane16_b32_tid_tid_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_tid_tid_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -5633,12 +5506,13 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i32(ptr addrspace(1) %out, i ; GFX12-LABEL: v_permlane16_b32_tid_tid_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -5652,23 +5526,24 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f32(ptr addrspace(1) %out, i ; GFX10-LABEL: v_permlane16_b32_tid_tid_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_tid_tid_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], 
s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -5676,12 +5551,13 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f32(ptr addrspace(1) %out, i ; GFX12-LABEL: v_permlane16_b32_tid_tid_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -5696,40 +5572,41 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i ; GFX10-SDAG-LABEL: v_permlane16_b32_tid_tid_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, 
s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_tid_tid_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_tid_tid_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: 
s_endpgm @@ -5737,14 +5614,15 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i ; GFX11-GISEL-LABEL: v_permlane16_b32_tid_tid_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -5752,14 +5630,15 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlane16_b32_tid_tid_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX12-SDAG-NEXT: 
v_permlane16_b32 v0, v0, s2, s3 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -5767,14 +5646,15 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlane16_b32_tid_tid_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -5790,12 +5670,12 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f64(ptr addrspace(1) %out, f ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -5803,75 +5683,79 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f64(ptr addrspace(1) %out, f ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_tid_tid_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; 
GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_tid_tid_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_tid_tid_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 
0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_tid_tid_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -5887,23 +5771,24 @@ define 
amdgpu_kernel void @v_permlane16_b32_undef_tid_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_undef_tid_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_undef_tid_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -5911,12 +5796,13 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_undef_tid_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -5931,23 +5817,24 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_undef_tid_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_undef_tid_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -5955,12 +5842,13 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_undef_tid_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 
s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -5976,38 +5864,39 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlane16_b32_undef_tid_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_undef_tid_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; 
GFX11-SDAG-LABEL: v_permlane16_b32_undef_tid_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -6015,14 +5904,15 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlane16_b32_undef_tid_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 +; 
GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -6030,14 +5920,15 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlane16_b32_undef_tid_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -6045,14 +5936,15 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlane16_b32_undef_tid_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 -; 
GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -6069,12 +5961,12 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f64(ptr addrspace(1) %out, ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -6082,75 +5974,79 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f64(ptr addrspace(1) %out, ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX10-GISEL-NEXT: 
global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_undef_tid_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_undef_tid_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX11-GISEL-NEXT: 
global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_undef_tid_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_undef_tid_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -6167,23 +6063,23 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 ; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_i32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 ; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_i32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm @@ -6191,13 +6087,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_i32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 -; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -6205,14 +6102,15 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_i32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -6220,13 +6118,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_i32: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 -; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -6234,14 +6133,15 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_i32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -6255,23 +6155,23 @@ define 
amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 ; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_f32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x449a5000 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 ; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_f32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm @@ -6279,13 +6179,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 -; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -6293,14 +6194,15 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -6308,13 +6210,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_f32: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 -; GFX12-SDAG-NEXT: 
global_store_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -6322,14 +6225,15 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_f32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -6344,43 +6248,43 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32 ; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s2, s3 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX10-SDAG-NEXT: 
v_permlane16_b32 v1, v0, s0, s1 ; GFX10-SDAG-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 -; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX10-GISEL-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s2, s3 -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 -; GFX11-SDAG-NEXT: global_store_b64 v3, v[1:2], s[0:1] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; 
GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -6388,15 +6292,15 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32 ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 -; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s2, s3 -; GFX11-GISEL-NEXT: global_store_b64 v3, v[1:2], s[0:1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -6404,15 +6308,15 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32 ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; 
GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s2, s3 -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 -; GFX12-SDAG-NEXT: global_store_b64 v3, v[1:2], s[0:1] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -6420,15 +6324,15 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 -; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s2, s3 -; GFX12-GISEL-NEXT: global_store_b64 v3, v[1:2], s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -6444,14 +6348,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f64(ptr addrspace(1) %out, i32 ; GFX10-SDAG: ; 
%bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x40934a00 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v1, s2, s3 -; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v0, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v0, s0, s1 ; GFX10-SDAG-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -6459,81 +6363,85 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f64(ptr addrspace(1) %out, i32 ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x40934a00 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v0, s2, s3 -; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v1, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v1, s0, s1 ; GFX10-GISEL-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; 
GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v1, s2, s3 -; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v0, s2, s3 -; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v1, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v0, s2, s3 -; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v1, s2, s3 -; GFX11-GISEL-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v1, s0, s1 +; 
GFX11-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v1, s2, s3 -; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v0, s2, s3 -; GFX12-SDAG-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v1, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: 
s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v0, s2, s3 -; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v1, s2, s3 -; GFX12-GISEL-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -6549,23 +6457,24 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_i_tid_fi_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_i_tid_fi_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 
0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -6573,12 +6482,13 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_i_tid_fi_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -6593,23 +6503,24 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_i_tid_fi_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_i_tid_fi_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -6617,12 +6528,13 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_i_tid_fi_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -6638,38 +6550,39 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_fi_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: 
v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -6677,14 +6590,15 @@ 
define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -6692,14 +6606,15 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) 
| instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -6707,14 +6622,15 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -6731,12 +6647,12 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -6744,75 +6660,79 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, 
v0, s2, s3 op_sel:[1,0] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 
v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -6829,23 +6749,24 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_i_tid_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_i_tid_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -6853,12 +6774,13 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_i_tid_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; 
GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -6873,23 +6795,24 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_i_tid_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_i_tid_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -6897,12 +6820,13 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f32(ptr addrspace(1) %out, ; 
GFX12-LABEL: v_permlane16_b32_i_tid_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -6918,38 +6842,39 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -6957,14 +6882,15 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -6972,14 +6898,15 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -6987,14 +6914,15 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64: 
; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -7011,12 +6939,12 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -7024,75 +6952,79 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; 
GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_bc_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: 
v_permlane16_b32_i_tid_bc_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_bc_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX12-SDAG-NEXT: 
global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_bc_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -7109,23 +7041,24 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %ou ; GFX10-LABEL: v_permlane16_b32_i_tid_fi_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_i_tid_fi_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -7133,12 +7066,13 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %ou ; GFX12-LABEL: v_permlane16_b32_i_tid_fi_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7153,23 +7087,24 @@ 
define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %ou ; GFX10-LABEL: v_permlane16_b32_i_tid_fi_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_i_tid_fi_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -7177,12 +7112,13 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %ou ; GFX12-LABEL: v_permlane16_b32_i_tid_fi_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] 
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7198,38 +7134,39 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %ou ; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: ; 
GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -7237,14 +7174,15 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %ou ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: 
v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -7252,14 +7190,15 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %ou ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -7267,14 +7206,15 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %ou ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 
op_sel:[1,1] -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -7291,12 +7231,12 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %ou ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -7304,75 +7244,79 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %ou ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: 
v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 
v2, 0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; 
GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -7389,23 +7333,24 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlanex16_b32_tid_tid_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_tid_tid_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -7413,12 +7358,13 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlanex16_b32_tid_tid_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7432,23 +7378,24 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlanex16_b32_tid_tid_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-NEXT: 
v_permlanex16_b32 v0, v0, s0, s1 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_tid_tid_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -7456,12 +7403,13 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlanex16_b32_tid_tid_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7476,40 +7424,41 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX11-SDAG-NEXT: 
v_permlanex16_b32 v0, v0, s2, s3 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -7517,14 +7466,15 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -7532,14 +7482,15 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 
0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -7547,14 +7498,15 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 
v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -7570,12 +7522,12 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f64(ptr addrspace(1) %out, ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -7583,75 +7535,79 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f64(ptr addrspace(1) %out, ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_tid_tid_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: 
s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_tid_tid_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] 
; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_tid_tid_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_tid_tid_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX12-GISEL-NEXT: 
v_permlanex16_b32 v1, v1, s2, s3 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -7667,23 +7623,24 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i32(ptr addrspace(1) %out ; GFX10-LABEL: v_permlanex16_b32_undef_tid_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_undef_tid_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -7691,12 +7648,13 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i32(ptr addrspace(1) %out ; GFX12-LABEL: 
v_permlanex16_b32_undef_tid_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7711,23 +7669,24 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f32(ptr addrspace(1) %out ; GFX10-LABEL: v_permlanex16_b32_undef_tid_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_undef_tid_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-NEXT: 
global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -7735,12 +7694,13 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f32(ptr addrspace(1) %out ; GFX12-LABEL: v_permlanex16_b32_undef_tid_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7756,38 +7716,39 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out ; GFX10-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -7795,14 +7756,15 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out ; GFX11-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; 
GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -7810,14 +7772,15 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out ; GFX12-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -7825,14 +7788,15 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out ; GFX12-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64: ; 
GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -7849,12 +7813,12 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f64(ptr addrspace(1) %out ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -7862,75 +7826,79 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f64(ptr addrspace(1) %out ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: 
s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_undef_tid_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_undef_tid_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_clause 0x1 
-; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_undef_tid_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 
v2, v[0:1], s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_undef_tid_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -7947,23 +7915,23 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3 ; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_i32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX10-SDAG-NEXT: 
global_store_dword v2, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_i32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm @@ -7971,13 +7939,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3 ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_i32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 -; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -7985,14 +7954,15 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3 ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_i32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; 
GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -8000,13 +7970,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3 ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_i32: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 -; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -8014,14 +7985,15 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3 ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_i32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: 
s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -8035,23 +8007,23 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3 ; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_f32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x449a5000 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_f32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm @@ -8059,13 
+8031,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3 ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 -; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -8073,14 +8046,15 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3 ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -8088,13 +8062,14 @@ define amdgpu_kernel void 
@v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3 ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_f32: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 -; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -8102,14 +8077,15 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3 ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_f32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -8124,43 +8100,43 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i3 ; GFX10-SDAG-LABEL: 
v_permlanex16_b32_i_tid_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s2, s3 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX10-SDAG-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX10-GISEL-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0 
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s2, s3 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 -; GFX11-SDAG-NEXT: global_store_b64 v3, v[1:2], s[0:1] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -8168,15 +8144,15 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i3 ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s2, s3 -; GFX11-GISEL-NEXT: global_store_b64 v3, v[1:2], s[0:1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: 
s_endpgm @@ -8184,15 +8160,15 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i3 ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s2, s3 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 -; GFX12-SDAG-NEXT: global_store_b64 v3, v[1:2], s[0:1] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -8200,15 +8176,15 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i3 ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) 
-; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s2, s3 -; GFX12-GISEL-NEXT: global_store_b64 v3, v[1:2], s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -8224,14 +8200,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f64(ptr addrspace(1) %out, i3 ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x40934a00 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v1, s2, s3 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v0, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v1, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v0, s0, s1 ; GFX10-SDAG-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -8239,81 +8215,85 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f64(ptr addrspace(1) %out, i3 ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x40934a00 ; GFX10-GISEL-NEXT: v_mov_b32_e32 
v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v0, s2, s3 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v1, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v1, s0, s1 ; GFX10-GISEL-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v1, s2, s3 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v0, s2, s3 -; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v1, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 
0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v0, s2, s3 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v1, s2, s3 -; GFX11-GISEL-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v1, s2, s3 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v0, s2, s3 -; GFX12-SDAG-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v1, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v0, s2, s3 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v1, s2, s3 -; GFX12-GISEL-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -8329,23 +8309,24 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: 
v_permlanex16_b32_i_tid_fi_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -8353,12 +8334,13 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -8373,23 +8355,24 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; 
GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -8397,12 +8380,13 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -8418,38 +8402,39 @@ define 
amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: 
v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -8457,14 +8442,15 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -8472,14 +8458,15 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) 
%out, ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -8487,14 +8474,15 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: 
v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -8511,12 +8499,12 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -8524,75 +8512,79 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; 
GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, 
s2, s3 op_sel:[1,0] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -8609,23 +8601,24 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlanex16_b32_i_tid_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_i_tid_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -8633,12 +8626,13 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlanex16_b32_i_tid_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -8653,23 +8647,24 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlanex16_b32_i_tid_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_i_tid_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; 
GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -8677,12 +8672,13 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlanex16_b32_i_tid_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -8698,38 +8694,39 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; 
GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; 
GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -8737,14 +8734,15 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -8752,14 +8750,15 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, 
v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -8767,14 +8766,15 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -8791,12 +8791,12 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX10-SDAG: ; %bb.0: ; 
GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -8804,75 +8804,79 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: 
s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] -; 
GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -8889,23 +8893,24 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %o ; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -8913,12 +8918,13 @@ define 
amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %o ; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -8933,23 +8939,24 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %o ; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] 
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -8957,12 +8964,13 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %o ; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -8978,38 +8986,39 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %o ; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX10-SDAG-NEXT: 
global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -9017,14 +9026,15 @@ define amdgpu_kernel void 
@v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %o ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -9032,14 +9042,15 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %o ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -9047,14 +9058,15 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %o ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -9071,12 +9083,12 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %o ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 
v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -9084,75 +9096,79 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %o ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] -; GFX11-SDAG-NEXT: 
v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; 
GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 
op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll index 973678291e263..a65143255bbb4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll @@ -10,7 +10,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -23,7 +23,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %sr ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -41,7 +41,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -54,7 +54,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %sr ; ; GFX12-GISEL-LABEL: 
v_permlane16var_b32_vi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -72,7 +72,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vl: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0xc1d1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -85,7 +85,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %sr ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vl: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -103,7 +103,8 @@ define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vvv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -115,9 +116,9 @@ define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %s ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vvv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: 
s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v1, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -134,7 +135,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_fi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -147,7 +148,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -165,7 +166,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -178,7 +179,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: 
v_permlane16var_b32_vv_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -196,7 +197,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_fi_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -209,7 +210,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -227,7 +228,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -240,7 +241,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %s ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 
s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -258,7 +259,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -271,7 +272,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %s ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -289,7 +290,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vl: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0xc1d1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -302,7 +303,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %s ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vl: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -320,7 +321,8 @@ define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vvv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -332,9 +334,9 @@ define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 % ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vvv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v1, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -351,7 +353,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 % define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_fi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -364,7 +366,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], 
s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -382,7 +384,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -395,7 +397,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -413,7 +415,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlanex16var_b32_vv_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_fi_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -426,7 +428,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi_bc(ptr addrspace(1) %out, ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; 
GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -445,10 +447,11 @@ define amdgpu_kernel void @v_permlane16var_b32_tid_tid(ptr addrspace(1) %out, i3 ; GFX12-SDAG-LABEL: v_permlane16var_b32_tid_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -459,10 +462,10 @@ define amdgpu_kernel void @v_permlane16var_b32_tid_tid(ptr addrspace(1) %out, i3 ; GFX12-GISEL-LABEL: v_permlane16var_b32_tid_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -480,10 +483,11 @@ define amdgpu_kernel void @v_permlane16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlane16var_b32_undef_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -494,10 +498,10 @@ define amdgpu_kernel void @v_permlane16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlane16var_b32_undef_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -516,11 +520,11 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v1, v0, v2 ; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1] @@ -531,10 
+535,12 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s2 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v0, v2 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -552,10 +558,11 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_fi: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -566,10 +573,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_fi: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; 
GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -588,10 +595,11 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_bc(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_bc: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -602,10 +610,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_bc(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_bc: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -624,10 +632,11 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi_bc(ptr addrspace(1) %out ; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_fi_bc: ; GFX12-SDAG: ; 
%bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -638,10 +647,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi_bc(ptr addrspace(1) %out ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_fi_bc: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -660,10 +669,11 @@ define amdgpu_kernel void @v_permlanex16var_b32_tid_tid(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlanex16var_b32_tid_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -674,10 +684,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_tid_tid(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlanex16var_b32_tid_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -695,10 +705,11 @@ define amdgpu_kernel void @v_permlanex16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlanex16var_b32_undef_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -709,10 +720,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16var_b32_undef_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 +; 
GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -731,11 +742,11 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v1, v0, v2 ; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1] @@ -746,10 +757,12 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s2 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v0, v2 ; GFX12-GISEL-NEXT: v_mov_b32_e32 
v0, 0 @@ -767,10 +780,11 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_fi: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -781,10 +795,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_fi: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -803,10 +817,11 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_bc(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_bc: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: 
s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -817,10 +832,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_bc(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_bc: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -839,10 +854,11 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi_bc(ptr addrspace(1) %ou ; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_fi_bc: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -853,10 +869,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi_bc(ptr addrspace(1) %ou ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_fi_bc: ; 
GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll index f653baa7365c7..abb2f87778187 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll @@ -5,114 +5,29 @@ declare i32 @llvm.amdgcn.permlane64(i32) declare i32 @llvm.amdgcn.workitem.id.x() -define amdgpu_kernel void @test_s_i32(ptr addrspace(1) %out, i32 %src0) { -; GFX11-LABEL: test_s_i32: +define amdgpu_kernel void @test_s(ptr addrspace(1) %out, i32 %src0) { +; GFX11-LABEL: test_s: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane64_b32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane64.i32(i32 %src0) + %v = call i32 @llvm.amdgcn.permlane64(i32 %src0) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @test_s_f32(ptr addrspace(1) %out, float %src0) { -; GFX11-LABEL: 
test_s_f32: +define amdgpu_kernel void @test_i(ptr addrspace(1) %out) { +; GFX11-LABEL: test_i: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane64_b32 v0, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm - %v = call float @llvm.amdgcn.permlane64.f32(float %src0) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @test_s_i64(ptr addrspace(1) %out, i64 %src0) { -; GFX11-SDAG-LABEL: test_s_i64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2 -; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: test_s_i64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 -; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm - %v = call i64 
@llvm.amdgcn.permlane64.i64(i64 %src0) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @test_s_f64(ptr addrspace(1) %out, double %src0) { -; GFX11-SDAG-LABEL: test_s_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2 -; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: test_s_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 -; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm - %v = call double @llvm.amdgcn.permlane64.f64(double %src0) - store double %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @test_i_i32(ptr addrspace(1) %out) { -; GFX11-LABEL: test_i_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane64_b32 v0, v0 @@ -121,16 +36,16 @@ define amdgpu_kernel void @test_i_i32(ptr addrspace(1) %out) { ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane64.i32(i32 99) + %v = call i32 @llvm.amdgcn.permlane64(i32 99) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @test_i_f32(ptr addrspace(1) %out) { -; GFX11-LABEL: test_i_f32: +define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 { +; GFX11-LABEL: test_v: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0x449a5000 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane64_b32 v0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -138,314 +53,11 @@ define amdgpu_kernel void @test_i_f32(ptr addrspace(1) %out) { ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm - %v = call float @llvm.amdgcn.permlane64.f32(float 1234.5) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @test_i_i64(ptr addrspace(1) %out) { -; GFX11-SDAG-LABEL: test_i_i64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x63 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v2 -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: test_i_i64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x63 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; 
GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 -; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v2 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm - %v = call i64 @llvm.amdgcn.permlane64.i64(i64 99) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @test_i_f64(ptr addrspace(1) %out) { -; GFX11-SDAG-LABEL: test_i_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x40934a00 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: test_i_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x40934a00 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v2 -; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm - %v = call double @llvm.amdgcn.permlane64.f64(double 1234.5) - store double %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @test_v_i32(ptr addrspace(1) %out, i32 %src0) #1 { -; GFX11-SDAG-LABEL: test_v_i32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; 
GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: test_v_i32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i32 @llvm.amdgcn.permlane64.i32(i32 %tidx) + %v = call i32 @llvm.amdgcn.permlane64(i32 %tidx) store i32 %v, ptr addrspace(1) %out ret void } - -define amdgpu_kernel void @test_v_f32(ptr addrspace(1) %out, float %src0) #1 { -; GFX11-SDAG-LABEL: test_v_f32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: test_v_f32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_f32 = bitcast i32 %tidx to float - %v = call float @llvm.amdgcn.permlane64.f32(float %tidx_f32) - store float %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void 
@test_v_i64(ptr addrspace(1) %out, i64 %src0) #1 { -; GFX11-LABEL: test_v_i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: v_permlane64_b32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlane64_b32 v1, v2 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidx_i64 = zext i32 %tidx to i64 - %v = call i64 @llvm.amdgcn.permlane64.i64(i64 %tidx_i64) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @test_v_f64(ptr addrspace(1) %out, double %src0) #1 { -; GFX11-SDAG-LABEL: test_v_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1 -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: test_v_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 -; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm - %tidx = call i32 
@llvm.amdgcn.workitem.id.x() - %tidx_f32 = bitcast i32 %tidx to float - %tidx_f64 = fpext float %tidx_f32 to double - %v = call double @llvm.amdgcn.permlane64.f64(double %tidx_f64) - store double %v, ptr addrspace(1) %out - ret void -} - -define void @test_half(ptr addrspace(1) %out, half %src0) { -; GFX11-LABEL: test_half: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_permlane64_b32 v2, v2 -; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_setpc_b64 s[30:31] - %v = call half @llvm.amdgcn.permlane64.f16(half %src0) - store half %v, ptr addrspace(1) %out - ret void -} - -define void @test_bfloat(ptr addrspace(1) %out, bfloat %src0) { -; GFX11-LABEL: test_bfloat: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_permlane64_b32 v2, v2 -; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_setpc_b64 s[30:31] - %v = call bfloat @llvm.amdgcn.permlane64.bf16(bfloat %src0) - store bfloat %v, ptr addrspace(1) %out - ret void -} - -define void @test_i16(ptr addrspace(1) %out, i16 %src0) { -; GFX11-LABEL: test_i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_permlane64_b32 v2, v2 -; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_setpc_b64 s[30:31] - %v = call i16 @llvm.amdgcn.permlane64.i16(i16 %src0) - store i16 %v, ptr addrspace(1) %out - ret void -} - -define void @test_v2f16(ptr addrspace(1) %out, <2 x half> %src0) { -; GFX11-LABEL: test_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_permlane64_b32 v2, v2 -; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_setpc_b64 s[30:31] - %v = call <2 x half> @llvm.amdgcn.permlane64.v2f16(<2 x half> %src0) - store <2 x half> %v, ptr addrspace(1) %out - ret void -} - -define void @test_v2f32(ptr addrspace(1) %out, <2 x float> %src0) { -; GFX11-SDAG-LABEL: test_v2f32: -; GFX11-SDAG: ; %bb.0: -; 
GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 -; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 -; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: test_v2f32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 -; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 -; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] - %v = call <2 x float> @llvm.amdgcn.permlane64.v2f32(<2 x float> %src0) - store <2 x float> %v, ptr addrspace(1) %out - ret void -} - -define void @test_v7i32(ptr addrspace(1) %out, <7 x i32> %src0) { -; GFX11-SDAG-LABEL: test_v7i32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_permlane64_b32 v8, v8 -; GFX11-SDAG-NEXT: v_permlane64_b32 v7, v7 -; GFX11-SDAG-NEXT: v_permlane64_b32 v6, v6 -; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 -; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4 -; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 -; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 -; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: test_v7i32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 -; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 -; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4 -; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5 -; GFX11-GISEL-NEXT: v_permlane64_b32 v6, v6 -; GFX11-GISEL-NEXT: v_permlane64_b32 v7, v7 -; GFX11-GISEL-NEXT: v_permlane64_b32 v8, v8 -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off -; GFX11-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 -; 
GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] - %v = call <7 x i32> @llvm.amdgcn.permlane64.v7i32(<7 x i32> %src0) - store <7 x i32> %v, ptr addrspace(1) %out - ret void -} - -define void @test_v8i16(ptr addrspace(1) %out, <8 x i16> %src0) { -; GFX11-SDAG-LABEL: test_v8i16: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 -; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4 -; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 -; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 -; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: test_v8i16: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 -; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 -; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4 -; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5 -; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] - %v = call <8 x i16> @llvm.amdgcn.permlane64.v8i16(<8 x i16> %src0) - store <8 x i16> %v, ptr addrspace(1) %out - ret void -} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX11-GISEL: {{.*}} +; GFX11-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll index 2070a832e0fcd..afa3fe8c2f1fb 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @test_p0(ptr addrspace(1) %out, ptr %src0) { ; GFX11-SDAG-LABEL: test_p0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2 @@ -24,13 +24,13 @@ define amdgpu_kernel void @test_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0) { ; GFX11-SDAG-LABEL: test_v3p0: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x2 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x44 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x54 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x44 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x54 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v1, s6 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s3 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v8, s2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s1 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v8, s0 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v1 @@ -40,8 +40,8 @@ define amdgpu_kernel void @test_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0) { ; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v0 ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v7 ; GFX11-SDAG-NEXT: s_clause 0x1 -; 
GFX11-SDAG-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16 -; GFX11-SDAG-NEXT: global_store_b128 v6, v[0:3], s[0:1] +; GFX11-SDAG-NEXT: global_store_b64 v6, v[4:5], s[2:3] offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v6, v[0:3], s[2:3] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -54,10 +54,10 @@ define amdgpu_kernel void @test_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0 ; GFX11-SDAG-LABEL: test_p3: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 ; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] @@ -73,8 +73,8 @@ define amdgpu_kernel void @test_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3 ; GFX11-SDAG-LABEL: test_v3p3: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s5 @@ -97,10 +97,10 @@ define amdgpu_kernel void @test_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0 ; GFX11-SDAG-LABEL: test_p5: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 ; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] @@ -116,8 +116,8 @@ define amdgpu_kernel void @test_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5 ; GFX11-SDAG-LABEL: test_v3p5: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s5 @@ -140,10 +140,10 @@ define amdgpu_kernel void @test_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0 ; GFX11-SDAG-LABEL: test_p6: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 ; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] @@ -159,8 +159,8 @@ define amdgpu_kernel void @test_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6 ; GFX11-SDAG-LABEL: test_v3p6: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-SDAG-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll index 36d2319788713..7e16358f74181 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll @@ -4,7 +4,7 @@ ; ERROR: in function test{{.*}}: unsupported hsa intrinsic without hsa target ; GCN-LABEL: {{^}}test: -; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0 ; GCN: .amdhsa_user_sgpr_queue_ptr 1 define amdgpu_kernel void @test(ptr addrspace(1) %out) { %queue_ptr = call noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0 @@ -13,9 +13,21 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out) { ret void } +; FIXME: Should really be able to delete the load +; GCN-LABEL: {{^}}test_ub: +; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0 +; GCN: .amdhsa_user_sgpr_queue_ptr 0 +define amdgpu_kernel void @test_ub(ptr addrspace(1) %out) #1 { + %queue_ptr = call noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0 + %value = load i32, ptr addrspace(4) %queue_ptr + store i32 %value, ptr addrspace(1) %out + ret void +} + declare noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0 attributes #0 = { nounwind readnone } +attributes #1 = { "amdgpu-no-queue-ptr" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll index 5d9daae69e786..9f0b420a0a828 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll @@ -5,7 +5,11 @@ define void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_ ; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 offen offset:24 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 24 @@ -17,7 +21,11 @@ define void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sg ; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_atomic_add_f32 v0, off, s[4:7], s8 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_atomic_add_f32 v0, off, s[8:11], s18 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) @@ -28,7 +36,11 @@ define void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffse ; CHECK-LABEL: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], s8 offen +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], s18 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -39,7 +51,11 @@ define void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__ ; CHECK-LABEL: 
raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, off, s[4:7], s8 offset:92 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, off, s[8:11], s18 offset:92 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0) @@ -50,7 +66,11 @@ define void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_ ; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 offen slc +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen slc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll index 9becefa33a8f2..320b0b4508b6a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll @@ -10,7 +10,7 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen offset:128 
th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s6 offen offset:128 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 128 @@ -26,7 +26,7 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_NT_RETURN +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s6 offset:92 th:TH_ATOMIC_NT_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 2) @@ -41,7 +41,7 @@ define void @raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen offset:128 +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s6 offen offset:128 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 128 %unused = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -56,7 +56,7 @@ define void @raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__0_voff ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_NT +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s6 offset:92 th:TH_ATOMIC_NT ; GFX12-NEXT: s_setpc_b64 s[30:31] %unused = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 2) ret void diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll index 9ac6b6a1d0ff9..ce46e2755ae58 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll @@ -8,21 +8,29 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voff ; GFX908-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 offen +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 offen scc +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen scc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 offen sc1 +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -33,7 +41,7 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voff ; GFX12-NEXT: s_wait_samplecnt 0x0 ; 
GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 offen scope:SCOPE_SYS +; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 24) ret void @@ -43,21 +51,29 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset ; GFX908-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: buffer_atomic_add_f32 v0, off, s[4:7], s8 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: buffer_atomic_add_f32 v0, off, s[8:11], s18 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: buffer_atomic_add_f32 v0, off, s[4:7], s8 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: buffer_atomic_add_f32 v0, off, s[8:11], s18 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s4 +; GFX940-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s6 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -68,7 +84,7 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset ; GFX12-NEXT: s_wait_samplecnt 0x0 ; 
GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s4 +; GFX12-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s6 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret void @@ -78,21 +94,29 @@ define void @raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX908-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], s8 offen +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], s18 offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], s8 offen +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], s18 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s6 offen ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -103,7 +127,7 @@ define void @raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX12-NEXT: s_wait_samplecnt 
0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s6 offen ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -113,21 +137,29 @@ define void @raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffs ; GFX908-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, off, s[4:7], s8 offset:92 +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, off, s[8:11], s18 offset:92 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, off, s[4:7], s8 offset:92 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, off, s[8:11], s18 offset:92 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s6 offset:92 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -138,7 +170,7 @@ define void 
@raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffs ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s6 offset:92 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 0) ret void @@ -148,21 +180,29 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voff ; GFX908-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 offen slc +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen slc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 offen slc +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen slc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 offen nt +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen nt ; GFX940-NEXT: s_waitcnt vmcnt(0) ; 
GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -173,7 +213,7 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voff ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_NT +; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen th:TH_ATOMIC_NT ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 2) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll index fc4449886d954..327d80a7b67cd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll @@ -7,14 +7,18 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffs ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 offen glc scc +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen glc scc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 offen sc0 sc1 +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -25,7 +29,7 @@ define float 
@raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffs ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 24) @@ -36,14 +40,18 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset_ ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: buffer_atomic_add_f32 v0, off, s[4:7], s8 glc +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: buffer_atomic_add_f32 v0, off, s[8:11], s18 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s4 sc0 +; GFX940-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s6 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -54,7 +62,7 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s6 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float 
@llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) @@ -65,14 +73,18 @@ define <2 x half> @raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgp ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], s8 offen glc +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], s18 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen sc0 +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s6 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -83,7 +95,7 @@ define <2 x half> @raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgp ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s6 offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -94,14 +106,18 @@ define <2 x half> @raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__0_v ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: 
buffer_atomic_pk_add_f16 v0, off, s[4:7], s8 offset:92 glc +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, off, s[8:11], s18 offset:92 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 sc0 +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s6 offset:92 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -112,7 +128,7 @@ define <2 x half> @raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__0_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s6 offset:92 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 0) @@ -123,14 +139,18 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffs ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 offen glc slc +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen glc slc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 offen sc0 nt +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen sc0 nt ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -141,7 +161,7 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffs ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_NT_RETURN +; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 2) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll index 3c800d0369e70..3ecbe3c71d022 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll @@ -9,7 +9,11 @@ define bfloat @raw_ptr_buffer_load_bf16(ptr addrspace(8) inreg %rsrc) { ; GFX7-LABEL: raw_ptr_buffer_load_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -17,21 +21,33 @@ define bfloat @raw_ptr_buffer_load_bf16(ptr addrspace(8) inreg %rsrc) { ; GFX8-LABEL: raw_ptr_buffer_load_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: raw_ptr_buffer_load_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s11, s17 +; GFX9-NEXT: s_mov_b32 s10, s16 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: raw_ptr_buffer_load_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -49,7 +65,11 @@ define <2 x bfloat> @raw_ptr_buffer_load_v2bf16(ptr addrspace(8) inreg %rsrc) { ; GFX7-LABEL: raw_ptr_buffer_load_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_load_dword v1, off, s[8:11], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 @@ -58,21 +78,33 @@ define <2 x bfloat> @raw_ptr_buffer_load_v2bf16(ptr addrspace(8) inreg %rsrc) { ; GFX8-LABEL: raw_ptr_buffer_load_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX8-NEXT: 
s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: raw_ptr_buffer_load_v2bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s11, s17 +; GFX9-NEXT: s_mov_b32 s10, s16 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: raw_ptr_buffer_load_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -90,7 +122,11 @@ define <4 x bfloat> @raw_ptr_buffer_load_v4bf16(ptr addrspace(8) inreg %rsrc) { ; GFX7-LABEL: raw_ptr_buffer_load_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 @@ -101,21 +137,33 @@ define <4 x bfloat> @raw_ptr_buffer_load_v4bf16(ptr addrspace(8) inreg %rsrc) { ; GFX8-LABEL: raw_ptr_buffer_load_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: 
s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: raw_ptr_buffer_load_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s11, s17 +; GFX9-NEXT: s_mov_b32 s10, s16 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: raw_ptr_buffer_load_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -139,7 +187,11 @@ define <8 x bfloat> @raw_ptr_buffer_load_v8bf16(ptr addrspace(8) inreg %rsrc) { ; GFX7-LABEL: raw_ptr_buffer_load_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 @@ -154,21 +206,33 @@ define <8 x bfloat> @raw_ptr_buffer_load_v8bf16(ptr addrspace(8) inreg %rsrc) { ; GFX8-LABEL: raw_ptr_buffer_load_v8bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX8-NEXT: s_mov_b32 s11, s17 +; GFX8-NEXT: s_mov_b32 s10, s16 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 
s8, s6 +; GFX8-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: raw_ptr_buffer_load_v8bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s11, s17 +; GFX9-NEXT: s_mov_b32 s10, s16 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: raw_ptr_buffer_load_v8bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll index 4d557c76dc4d0..cc1547eaad830 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll @@ -1180,14 +1180,22 @@ define double @buffer_load_f64__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %v ; PREGFX10-LABEL: buffer_load_f64__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 +; PREGFX10-NEXT: s_mov_b32 s11, s17 +; PREGFX10-NEXT: s_mov_b32 s10, s16 +; PREGFX10-NEXT: s_mov_b32 s9, s7 +; PREGFX10-NEXT: s_mov_b32 s8, s6 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_f64__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1206,14 +1214,22 @@ define <2 x double> @buffer_load_v2f64__voffset_add(ptr addrspace(8) inreg %rsrc ; PREGFX10-LABEL: buffer_load_v2f64__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 +; PREGFX10-NEXT: s_mov_b32 s11, s17 +; PREGFX10-NEXT: s_mov_b32 s10, s16 +; PREGFX10-NEXT: s_mov_b32 s9, s7 +; PREGFX10-NEXT: s_mov_b32 s8, s6 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2f64__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1232,14 +1248,22 @@ define i64 @buffer_load_i64__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voff ; PREGFX10-LABEL: buffer_load_i64__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 +; PREGFX10-NEXT: s_mov_b32 s11, s17 +; PREGFX10-NEXT: s_mov_b32 s10, s16 +; PREGFX10-NEXT: s_mov_b32 s9, s7 +; PREGFX10-NEXT: s_mov_b32 s8, s6 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt 
vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_i64__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1258,14 +1282,22 @@ define <2 x i64> @buffer_load_v2i64__voffset_add(ptr addrspace(8) inreg %rsrc, i ; PREGFX10-LABEL: buffer_load_v2i64__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 +; PREGFX10-NEXT: s_mov_b32 s11, s17 +; PREGFX10-NEXT: s_mov_b32 s10, s16 +; PREGFX10-NEXT: s_mov_b32 s9, s7 +; PREGFX10-NEXT: s_mov_b32 s8, s6 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2i64__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1284,14 +1316,22 @@ define ptr @buffer_load_p0__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voffs ; PREGFX10-LABEL: buffer_load_p0__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 +; PREGFX10-NEXT: s_mov_b32 s11, s17 +; PREGFX10-NEXT: s_mov_b32 s10, s16 +; PREGFX10-NEXT: s_mov_b32 
s9, s7 +; PREGFX10-NEXT: s_mov_b32 s8, s6 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p0__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1310,14 +1350,22 @@ define <2 x ptr> @buffer_load_v2p0__voffset_add(ptr addrspace(8) inreg %rsrc, i3 ; PREGFX10-LABEL: buffer_load_v2p0__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 +; PREGFX10-NEXT: s_mov_b32 s11, s17 +; PREGFX10-NEXT: s_mov_b32 s10, s16 +; PREGFX10-NEXT: s_mov_b32 s9, s7 +; PREGFX10-NEXT: s_mov_b32 s8, s6 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p0__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1336,14 +1384,22 @@ define ptr addrspace(1) @buffer_load_p1__voffset_add(ptr addrspace(8) inreg %rsr ; PREGFX10-LABEL: buffer_load_p1__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: buffer_load_dwordx2 
v[0:1], v0, s[4:7], 0 offen offset:60 +; PREGFX10-NEXT: s_mov_b32 s11, s17 +; PREGFX10-NEXT: s_mov_b32 s10, s16 +; PREGFX10-NEXT: s_mov_b32 s9, s7 +; PREGFX10-NEXT: s_mov_b32 s8, s6 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p1__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1362,14 +1418,22 @@ define <2 x ptr addrspace(1)> @buffer_load_v2p1__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v2p1__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 +; PREGFX10-NEXT: s_mov_b32 s11, s17 +; PREGFX10-NEXT: s_mov_b32 s10, s16 +; PREGFX10-NEXT: s_mov_b32 s9, s7 +; PREGFX10-NEXT: s_mov_b32 s8, s6 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p1__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1388,14 +1452,22 @@ define ptr addrspace(4) @buffer_load_p4__voffset_add(ptr addrspace(8) inreg %rsr ; PREGFX10-LABEL: 
buffer_load_p4__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 +; PREGFX10-NEXT: s_mov_b32 s11, s17 +; PREGFX10-NEXT: s_mov_b32 s10, s16 +; PREGFX10-NEXT: s_mov_b32 s9, s7 +; PREGFX10-NEXT: s_mov_b32 s8, s6 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p4__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1414,14 +1486,22 @@ define <2 x ptr addrspace(4)> @buffer_load_v2p4__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v2p4__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 +; PREGFX10-NEXT: s_mov_b32 s11, s17 +; PREGFX10-NEXT: s_mov_b32 s10, s16 +; PREGFX10-NEXT: s_mov_b32 s9, s7 +; PREGFX10-NEXT: s_mov_b32 s8, s6 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p4__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: 
s_setpc_b64 s[30:31] ; @@ -1440,14 +1520,22 @@ define ptr addrspace(999) @buffer_load_p999__voffset_add(ptr addrspace(8) inreg ; PREGFX10-LABEL: buffer_load_p999__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 +; PREGFX10-NEXT: s_mov_b32 s11, s17 +; PREGFX10-NEXT: s_mov_b32 s10, s16 +; PREGFX10-NEXT: s_mov_b32 s9, s7 +; PREGFX10-NEXT: s_mov_b32 s8, s6 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p999__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1466,14 +1554,22 @@ define <2 x ptr addrspace(999)> @buffer_load_v2p999__voffset_add(ptr addrspace(8 ; PREGFX10-LABEL: buffer_load_v2p999__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 +; PREGFX10-NEXT: s_mov_b32 s11, s17 +; PREGFX10-NEXT: s_mov_b32 s10, s16 +; PREGFX10-NEXT: s_mov_b32 s9, s7 +; PREGFX10-NEXT: s_mov_b32 s8, s6 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p999__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; 
GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1492,14 +1588,22 @@ define ptr addrspace(2) @buffer_load_p2__voffset_add(ptr addrspace(8) inreg %rsr ; PREGFX10-LABEL: buffer_load_p2__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:60 +; PREGFX10-NEXT: s_mov_b32 s11, s17 +; PREGFX10-NEXT: s_mov_b32 s10, s16 +; PREGFX10-NEXT: s_mov_b32 s9, s7 +; PREGFX10-NEXT: s_mov_b32 s8, s6 +; PREGFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p2__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1518,14 +1622,22 @@ define <2 x ptr addrspace(2)> @buffer_load_v2p2__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v2p2__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 +; PREGFX10-NEXT: s_mov_b32 s11, s17 +; PREGFX10-NEXT: s_mov_b32 s10, s16 +; PREGFX10-NEXT: s_mov_b32 s9, s7 +; PREGFX10-NEXT: s_mov_b32 s8, s6 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p2__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 
offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1544,7 +1656,11 @@ define <3 x ptr addrspace(2)> @buffer_load_v3p2__voffset_add(ptr addrspace(8) in ; GFX10-LABEL: buffer_load_v3p2__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[4:7], 0 offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[8:11], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1563,14 +1679,22 @@ define <4 x ptr addrspace(2)> @buffer_load_v4p2__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v4p2__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 +; PREGFX10-NEXT: s_mov_b32 s11, s17 +; PREGFX10-NEXT: s_mov_b32 s10, s16 +; PREGFX10-NEXT: s_mov_b32 s9, s7 +; PREGFX10-NEXT: s_mov_b32 s8, s6 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v4p2__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1589,14 +1713,22 @@ define ptr addrspace(3) 
@buffer_load_p3__voffset_add(ptr addrspace(8) inreg %rsr ; PREGFX10-LABEL: buffer_load_p3__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:60 +; PREGFX10-NEXT: s_mov_b32 s11, s17 +; PREGFX10-NEXT: s_mov_b32 s10, s16 +; PREGFX10-NEXT: s_mov_b32 s9, s7 +; PREGFX10-NEXT: s_mov_b32 s8, s6 +; PREGFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p3__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1615,14 +1747,22 @@ define <2 x ptr addrspace(3)> @buffer_load_v2p3__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v2p3__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 +; PREGFX10-NEXT: s_mov_b32 s11, s17 +; PREGFX10-NEXT: s_mov_b32 s10, s16 +; PREGFX10-NEXT: s_mov_b32 s9, s7 +; PREGFX10-NEXT: s_mov_b32 s8, s6 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p3__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 ; 
GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1641,7 +1781,11 @@ define <3 x ptr addrspace(3)> @buffer_load_v3p3__voffset_add(ptr addrspace(8) in ; GFX10-LABEL: buffer_load_v3p3__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[4:7], 0 offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[8:11], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1660,14 +1804,22 @@ define <4 x ptr addrspace(3)> @buffer_load_v4p3__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v4p3__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 +; PREGFX10-NEXT: s_mov_b32 s11, s17 +; PREGFX10-NEXT: s_mov_b32 s10, s16 +; PREGFX10-NEXT: s_mov_b32 s9, s7 +; PREGFX10-NEXT: s_mov_b32 s8, s6 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v4p3__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1686,14 +1838,22 @@ define ptr addrspace(5) @buffer_load_p5__voffset_add(ptr addrspace(8) inreg %rsr ; PREGFX10-LABEL: buffer_load_p5__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 
offen offset:60 +; PREGFX10-NEXT: s_mov_b32 s11, s17 +; PREGFX10-NEXT: s_mov_b32 s10, s16 +; PREGFX10-NEXT: s_mov_b32 s9, s7 +; PREGFX10-NEXT: s_mov_b32 s8, s6 +; PREGFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p5__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1712,14 +1872,22 @@ define <2 x ptr addrspace(5)> @buffer_load_v2p5__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v2p5__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 +; PREGFX10-NEXT: s_mov_b32 s11, s17 +; PREGFX10-NEXT: s_mov_b32 s10, s16 +; PREGFX10-NEXT: s_mov_b32 s9, s7 +; PREGFX10-NEXT: s_mov_b32 s8, s6 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p5__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1738,7 +1906,11 @@ define <3 x ptr addrspace(5)> @buffer_load_v3p5__voffset_add(ptr addrspace(8) in ; GFX10-LABEL: buffer_load_v3p5__voffset_add: ; GFX10: ; %bb.0: 
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[4:7], 0 offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[8:11], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1757,14 +1929,22 @@ define <4 x ptr addrspace(5)> @buffer_load_v4p5__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v4p5__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 +; PREGFX10-NEXT: s_mov_b32 s11, s17 +; PREGFX10-NEXT: s_mov_b32 s10, s16 +; PREGFX10-NEXT: s_mov_b32 s9, s7 +; PREGFX10-NEXT: s_mov_b32 s8, s6 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v4p5__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1783,14 +1963,22 @@ define ptr addrspace(6) @buffer_load_p6__voffset_add(ptr addrspace(8) inreg %rsr ; PREGFX10-LABEL: buffer_load_p6__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:60 +; PREGFX10-NEXT: s_mov_b32 s11, s17 +; PREGFX10-NEXT: s_mov_b32 s10, s16 +; PREGFX10-NEXT: s_mov_b32 s9, s7 +; PREGFX10-NEXT: s_mov_b32 s8, s6 +; PREGFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 ; 
PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p6__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1809,14 +1997,22 @@ define <2 x ptr addrspace(6)> @buffer_load_v2p6__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v2p6__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 +; PREGFX10-NEXT: s_mov_b32 s11, s17 +; PREGFX10-NEXT: s_mov_b32 s10, s16 +; PREGFX10-NEXT: s_mov_b32 s9, s7 +; PREGFX10-NEXT: s_mov_b32 s8, s6 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p6__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1835,7 +2031,11 @@ define <3 x ptr addrspace(6)> @buffer_load_v3p6__voffset_add(ptr addrspace(8) in ; GFX10-LABEL: buffer_load_v3p6__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[4:7], 0 offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; 
GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[8:11], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1854,14 +2054,22 @@ define <4 x ptr addrspace(6)> @buffer_load_v4p6__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v4p6__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 +; PREGFX10-NEXT: s_mov_b32 s11, s17 +; PREGFX10-NEXT: s_mov_b32 s10, s16 +; PREGFX10-NEXT: s_mov_b32 s9, s7 +; PREGFX10-NEXT: s_mov_b32 s8, s6 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v4p6__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll index 4fbb4ec342ff5..d9227724c22a1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll @@ -352,14 +352,22 @@ define void @buffer_store_f64__voffset_add(ptr addrspace(8) inreg %rsrc, double ; VERDE-LABEL: buffer_store_f64__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; VERDE-NEXT: 
buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_f64__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -371,14 +379,22 @@ define void @buffer_store_v2f64__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x ; VERDE-LABEL: buffer_store_v2f64__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2f64__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -390,14 +406,22 @@ define void @buffer_store_i64__voffset_add(ptr addrspace(8) inreg %rsrc, i64 %da ; VERDE-LABEL: buffer_store_i64__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, 
s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_i64__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -409,14 +433,22 @@ define void @buffer_store_v2i64__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x ; VERDE-LABEL: buffer_store_v2i64__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2i64__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -428,14 +460,22 @@ define void @buffer_store_p0__voffset_add(ptr addrspace(8) inreg %rsrc, 
ptr %dat ; VERDE-LABEL: buffer_store_p0__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p0__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -447,14 +487,22 @@ define void @buffer_store_v2p0__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; VERDE-LABEL: buffer_store_v2p0__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p0__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt 
vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -466,14 +514,22 @@ define void @buffer_store_p1__voffset_add(ptr addrspace(8) inreg %rsrc, ptr addr ; VERDE-LABEL: buffer_store_p1__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p1__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -485,14 +541,22 @@ define void @buffer_store_v2p1__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; VERDE-LABEL: buffer_store_v2p1__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p1__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 
s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -504,14 +568,22 @@ define void @buffer_store_p4__voffset_add(ptr addrspace(8) inreg %rsrc, ptr addr ; VERDE-LABEL: buffer_store_p4__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p4__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -523,14 +595,22 @@ define void @buffer_store_v2p4__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; VERDE-LABEL: buffer_store_v2p4__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p4__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -542,14 +622,22 @@ define void @buffer_store_p999__voffset_add(ptr addrspace(8) inreg %rsrc, ptr ad ; VERDE-LABEL: buffer_store_p999__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p999__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -561,14 +649,22 @@ define void @buffer_store_v2p999__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x ; VERDE-LABEL: buffer_store_v2p999__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 
0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p999__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -580,14 +676,22 @@ define void @buffer_store_p2__voffset_add(ptr addrspace(8) inreg %rsrc, ptr addr ; VERDE-LABEL: buffer_store_p2__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; VERDE-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p2__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -599,14 +703,22 @@ define void @buffer_store_v2p2__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; VERDE-LABEL: buffer_store_v2p2__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; 
VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p2__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -618,14 +730,22 @@ define void @buffer_store_v3p2__voffset_add(ptr addrspace(8) inreg %rsrc, <3 x p ; VERDE-LABEL: buffer_store_v3p2__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v3p2__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[4:7], 0 offen offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -637,14 +757,22 @@ define void @buffer_store_v4p2__voffset_add(ptr addrspace(8) inreg %rsrc, <4 x p ; VERDE-LABEL: buffer_store_v4p2__voffset_add: ; VERDE: 
; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v4p2__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -656,14 +784,22 @@ define void @buffer_store_p3__voffset_add(ptr addrspace(8) inreg %rsrc, ptr addr ; VERDE-LABEL: buffer_store_p3__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; VERDE-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p3__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -675,14 
+811,22 @@ define void @buffer_store_v2p3__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; VERDE-LABEL: buffer_store_v2p3__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p3__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -694,14 +838,22 @@ define void @buffer_store_v3p3__voffset_add(ptr addrspace(8) inreg %rsrc, <3 x p ; VERDE-LABEL: buffer_store_v3p3__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v3p3__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[4:7], 0 offen offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: 
buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -713,14 +865,22 @@ define void @buffer_store_v4p3__voffset_add(ptr addrspace(8) inreg %rsrc, <4 x p ; VERDE-LABEL: buffer_store_v4p3__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v4p3__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -732,14 +892,22 @@ define void @buffer_store_p5__voffset_add(ptr addrspace(8) inreg %rsrc, ptr addr ; VERDE-LABEL: buffer_store_p5__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; VERDE-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p5__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen 
offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -751,14 +919,22 @@ define void @buffer_store_v2p5__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; VERDE-LABEL: buffer_store_v2p5__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p5__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -770,14 +946,22 @@ define void @buffer_store_v3p5__voffset_add(ptr addrspace(8) inreg %rsrc, <3 x p ; VERDE-LABEL: buffer_store_v3p5__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: 
buffer_store_v3p5__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[4:7], 0 offen offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -789,14 +973,22 @@ define void @buffer_store_v4p5__voffset_add(ptr addrspace(8) inreg %rsrc, <4 x p ; VERDE-LABEL: buffer_store_v4p5__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v4p5__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -808,14 +1000,22 @@ define void @buffer_store_p6__voffset_add(ptr addrspace(8) inreg %rsrc, ptr addr ; VERDE-LABEL: buffer_store_p6__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; 
VERDE-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p6__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -827,14 +1027,22 @@ define void @buffer_store_v2p6__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; VERDE-LABEL: buffer_store_v2p6__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p6__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -846,14 +1054,22 @@ define void @buffer_store_v3p6__voffset_add(ptr addrspace(8) inreg %rsrc, <3 x p ; VERDE-LABEL: buffer_store_v3p6__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, 
s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v3p6__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[4:7], 0 offen offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -865,14 +1081,22 @@ define void @buffer_store_v4p6__voffset_add(ptr addrspace(8) inreg %rsrc, <4 x p ; VERDE-LABEL: buffer_store_v4p6__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 +; VERDE-NEXT: s_mov_b32 s11, s17 +; VERDE-NEXT: s_mov_b32 s10, s16 +; VERDE-NEXT: s_mov_b32 s9, s7 +; VERDE-NEXT: s_mov_b32 s8, s6 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v4p6__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll index cb511c93f67ed..30f04f1ff220c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll @@ -8,8 +8,8 @@ define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[0:1], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[2:3], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] @@ -17,28 +17,28 @@ define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %dat ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dword s2, s[0:1], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_x: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 
+; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_x: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-PACKED-NEXT: s_load_b32 s4, s[2:3], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] @@ -53,8 +53,8 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x half> %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[0:1], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[2:3], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s5, s4, 16 ; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff @@ -65,28 +65,28 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x hal ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dword s2, s[0:1], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; 
GFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-PACKED-NEXT: s_load_b32 s4, s[2:3], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] @@ -101,8 +101,8 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x half> %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff ; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s4, 16 @@ -115,32 +115,32 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x ha ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; 
PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: s_and_b32 s0, s3, 0xffff -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s0 +; PREGFX10-PACKED-NEXT: s_and_b32 s1, s1, 0xffff +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: s_and_b32 s0, s3, 0xffff -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-PACKED-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: s_and_b32 s5, s5, 0xffff ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 @@ -158,8 +158,8 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr 
addrspace(8) %rsrc, <4 x half> %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s5, 16 ; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff @@ -174,30 +174,30 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x h ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s3 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], off, s[4:7], 0 
format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll index 01df7634f0e9c..a241bdeaff1a7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll @@ -10,8 +10,8 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[0:1], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[2:3], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] @@ -19,28 +19,28 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) { ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dword s2, s[0:1], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 +; 
PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_x: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_x: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-PACKED-NEXT: s_load_b32 s4, s[2:3], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] @@ -51,8 +51,8 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) { ; GFX12-PACKED-LABEL: tbuffer_store_d16_x: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: s_clause 0x1 -; GFX12-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34 -; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-PACKED-NEXT: s_load_b32 s4, s[2:3], 0x34 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] @@ -67,8 +67,8 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> 
%data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[0:1], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[2:3], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s5, s4, 16 ; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff @@ -79,28 +79,28 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dword s2, s[0:1], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 
0x24 +; GFX11-PACKED-NEXT: s_load_b32 s4, s[2:3], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] @@ -111,8 +111,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat ; GFX12-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: s_clause 0x1 -; GFX12-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34 -; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-PACKED-NEXT: s_load_b32 s4, s[2:3], 0x34 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] @@ -127,8 +127,8 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff ; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s4, 16 @@ -141,32 +141,32 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) 
-; PREGFX10-PACKED-NEXT: s_and_b32 s0, s3, 0xffff -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s0 +; PREGFX10-PACKED-NEXT: s_and_b32 s1, s1, 0xffff +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: s_and_b32 s0, s3, 0xffff -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-PACKED-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: s_and_b32 s5, s5, 0xffff ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 @@ -179,8 +179,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; GFX12-PACKED-SDAG-LABEL: tbuffer_store_d16_xyz: ; GFX12-PACKED-SDAG: ; %bb.0: ; %main_body ; GFX12-PACKED-SDAG-NEXT: s_clause 0x1 -; GFX12-PACKED-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; 
GFX12-PACKED-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-PACKED-SDAG-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-PACKED-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-PACKED-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-SDAG-NEXT: s_and_b32 s5, s5, 0xffff ; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s4 @@ -193,8 +193,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; GFX12-PACKED-GISEL-LABEL: tbuffer_store_d16_xyz: ; GFX12-PACKED-GISEL: ; %bb.0: ; %main_body ; GFX12-PACKED-GISEL-NEXT: s_clause 0x1 -; GFX12-PACKED-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-PACKED-GISEL-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-PACKED-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-GISEL-NEXT: s_pack_lh_b32_b16 s4, s4, s4 ; GFX12-PACKED-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -213,8 +213,8 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s5, 16 ; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff @@ -229,30 +229,30 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s3 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -264,8 +264,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d ; GFX12-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: s_clause 0x1 -; GFX12-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; 
GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index cc6c630ae6466..b061d53de5d3c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -151,7 +151,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) #1 define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -161,7 +161,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out ; ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -176,7 +176,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -187,7 +187,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out ; ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i64: ; CHECK-GISEL: ; %bb.0: -; 
CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -204,7 +204,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -215,7 +215,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out ; ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -233,7 +233,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readfirstlane_m0: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 m0, -1 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -246,7 +246,7 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 { ; ; CHECK-GISEL-LABEL: test_readfirstlane_m0: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 m0, -1 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -265,7 +265,7 @@ 
define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 { define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 s2, 0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -278,7 +278,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1 ; ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -297,7 +297,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1 define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -311,7 +311,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1 ; ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -331,7 +331,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1 define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_f64: ; 
CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -345,7 +345,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1 ; ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -365,7 +365,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1 define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readfirstlane_fi: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_add_u32 s0, s0, s9 +; CHECK-SDAG-NEXT: s_add_u32 s0, s0, s15 ; CHECK-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-SDAG-NEXT: s_mov_b32 s4, 0 ; CHECK-SDAG-NEXT: ;;#ASMSTART @@ -375,7 +375,7 @@ define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) #1 { ; ; CHECK-GISEL-LABEL: test_readfirstlane_fi: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_add_u32 s0, s0, s9 +; CHECK-GISEL-NEXT: s_add_u32 s0, s0, s15 ; CHECK-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-GISEL-NEXT: s_mov_b32 s4, 0 ; CHECK-GISEL-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll index 66e1f9396de5a..24a332fa211c1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -9,7 +9,7 @@ declare double @llvm.amdgcn.readlane.f64(double, i32) #0 define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: 
s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; use s0 @@ -18,7 +18,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1 ; ; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; use s0 @@ -32,7 +32,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1 define amdgpu_kernel void @test_readlane_sreg_sreg_i64(i64 %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; use s[0:1] @@ -41,7 +41,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_i64(i64 %src0, i32 %src1) #1 ; ; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; use s[0:1] @@ -55,7 +55,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_i64(i64 %src0, i32 %src1) #1 define amdgpu_kernel void @test_readlane_sreg_sreg_f64(double %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; use s[0:1] @@ -64,7 +64,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_f64(double %src0, i32 %src1) ; ; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_f64: ; 
CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; use s[0:1] @@ -78,7 +78,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_f64(double %src0, i32 %src1) define amdgpu_kernel void @test_readlane_vreg_sreg_i32(i32 %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dword s0, s[4:5], 0x4 +; CHECK-SDAG-NEXT: s_load_dword s0, s[6:7], 0x4 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -91,7 +91,7 @@ define amdgpu_kernel void @test_readlane_vreg_sreg_i32(i32 %src0, i32 %src1) #1 ; ; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dword s0, s[4:5], 0x4 +; CHECK-GISEL-NEXT: s_load_dword s0, s[6:7], 0x4 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -110,7 +110,7 @@ define amdgpu_kernel void @test_readlane_vreg_sreg_i32(i32 %src0, i32 %src1) #1 define amdgpu_kernel void @test_readlane_vreg_sreg_i64(i64 %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dword s0, s[4:5], 0x8 +; CHECK-SDAG-NEXT: s_load_dword s0, s[6:7], 0x8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -124,7 +124,7 @@ define amdgpu_kernel void @test_readlane_vreg_sreg_i64(i64 %src0, i32 %src1) #1 ; ; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dword s1, s[4:5], 0x8 +; CHECK-GISEL-NEXT: s_load_dword s1, s[6:7], 0x8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -144,7 +144,7 @@ define amdgpu_kernel void @test_readlane_vreg_sreg_i64(i64 %src0, i32 %src1) #1 define amdgpu_kernel void 
@test_readlane_vreg_sreg_f64(double %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dword s0, s[4:5], 0x8 +; CHECK-SDAG-NEXT: s_load_dword s0, s[6:7], 0x8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -158,7 +158,7 @@ define amdgpu_kernel void @test_readlane_vreg_sreg_f64(double %src0, i32 %src1) ; ; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dword s1, s[4:5], 0x8 +; CHECK-GISEL-NEXT: s_load_dword s1, s[6:7], 0x8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -178,7 +178,7 @@ define amdgpu_kernel void @test_readlane_vreg_sreg_f64(double %src0, i32 %src1) define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -188,7 +188,7 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32 ; ; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -203,7 +203,7 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32 define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-SDAG-NEXT: 
v_mov_b32_e32 v0, 32 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -214,7 +214,7 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 ; ; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -231,7 +231,7 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -242,7 +242,7 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 ; ; CHECK-GISEL-LABEL: test_readlane_imm_sreg_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -260,7 +260,7 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { ; CHECK-SDAG-LABEL: test_readlane_vregs_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -279,7 +279,7 @@ define amdgpu_kernel void 
@test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad ; ; CHECK-GISEL-LABEL: test_readlane_vregs_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -309,7 +309,7 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { ; CHECK-SDAG-LABEL: test_readlane_vregs_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -330,7 +330,7 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad ; ; CHECK-GISEL-LABEL: test_readlane_vregs_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -363,7 +363,7 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { ; CHECK-SDAG-LABEL: test_readlane_vregs_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -384,7 +384,7 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad ; ; CHECK-GISEL-LABEL: test_readlane_vregs_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -418,7 +418,7 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_m0_sreg: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 m0, -1 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -431,7 +431,7 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src ; ; CHECK-GISEL-LABEL: test_readlane_m0_sreg: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 m0, -1 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -450,7 +450,7 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -464,7 +464,7 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 ; ; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -484,7 +484,7 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 
define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -500,7 +500,7 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 ; ; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -522,7 +522,7 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -538,7 +538,7 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 ; ; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -560,7 +560,7 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: 
s_mov_b32 s2, 0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -573,7 +573,7 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %ou ; ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -592,7 +592,7 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %ou define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -606,7 +606,7 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou ; ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -626,7 +626,7 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -640,7 +640,7 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou ; ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll index f52461b6b3807..e2f494283a3f2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll @@ -18,21 +18,21 @@ declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: uniform_value: ; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8DAGISEL-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s2, s[2:3], 0x2c ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8DAGISEL-NEXT: s_endpgm ; ; GFX8GISEL-LABEL: uniform_value: ; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -40,54 +40,54 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX9DAGISEL-LABEL: uniform_value: ; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; 
GFX9DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9DAGISEL-NEXT: s_endpgm ; ; GFX9GISEL-LABEL: uniform_value: ; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9GISEL-NEXT: s_endpgm ; ; GFX10DAGISEL-LABEL: uniform_value: ; GFX10DAGISEL: ; %bb.0: ; %entry ; GFX10DAGISEL-NEXT: s_clause 0x1 -; GFX10DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10DAGISEL-NEXT: s_endpgm ; ; GFX10GISEL-LABEL: uniform_value: ; GFX10GISEL: ; %bb.0: ; %entry ; GFX10GISEL-NEXT: s_clause 0x1 -; GFX10GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10GISEL-NEXT: 
global_store_dword v1, v0, s[2:3] +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10GISEL-NEXT: s_endpgm ; ; GFX1164DAGISEL-LABEL: uniform_value: ; GFX1164DAGISEL: ; %bb.0: ; %entry ; GFX1164DAGISEL-NEXT: s_clause 0x1 -; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 ; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1164DAGISEL-NEXT: s_nop 0 ; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -96,11 +96,11 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-LABEL: uniform_value: ; GFX1164GISEL: ; %bb.0: ; %entry ; GFX1164GISEL-NEXT: s_clause 0x1 -; GFX1164GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1164GISEL-NEXT: s_nop 0 ; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -109,10 +109,10 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-LABEL: uniform_value: ; GFX1132DAGISEL: ; %bb.0: ; %entry ; GFX1132DAGISEL-NEXT: s_clause 0x1 -; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 
:: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1132DAGISEL-NEXT: s_nop 0 ; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -121,10 +121,10 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-LABEL: uniform_value: ; GFX1132GISEL: ; %bb.0: ; %entry ; GFX1132GISEL-NEXT: s_clause 0x1 -; GFX1132GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1132GISEL-NEXT: s_nop 0 ; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -138,7 +138,7 @@ entry: define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; GFX8DAGISEL-LABEL: const_value: ; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -148,7 +148,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX8GISEL-LABEL: const_value: ; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -158,7 +158,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX9DAGISEL-LABEL: const_value: ; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -167,7 +167,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX9GISEL-LABEL: const_value: ; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -176,7 +176,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX10DAGISEL-LABEL: const_value: ; GFX10DAGISEL: ; %bb.0: ; %entry -; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -185,7 +185,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX10GISEL-LABEL: const_value: ; GFX10GISEL: ; %bb.0: ; %entry -; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -194,7 +194,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1164DAGISEL-LABEL: const_value: ; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -205,7 +205,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1164GISEL-LABEL: const_value: ; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 
v0, 0x7b ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -216,7 +216,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1132DAGISEL-LABEL: const_value: ; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -226,7 +226,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1132GISEL-LABEL: const_value: ; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] @@ -280,7 +280,7 @@ entry: define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: divergent_value: ; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0 ; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -300,7 +300,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX8GISEL-LABEL: divergent_value: ; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_mov_b32 s4, 0 ; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -320,7 +320,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX9DAGISEL-LABEL: divergent_value: ; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0 @@ -339,7 +339,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX9GISEL-LABEL: divergent_value: ; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_mov_b32 s4, 0 ; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -358,7 +358,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1064DAGISEL-LABEL: divergent_value: ; GFX1064DAGISEL: ; %bb.0: ; %entry -; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0 @@ -377,7 +377,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1064GISEL-LABEL: divergent_value: ; GFX1064GISEL: ; %bb.0: ; %entry -; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_mov_b32 s4, 0 ; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -396,7 +396,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1032DAGISEL-LABEL: divergent_value: ; GFX1032DAGISEL: ; %bb.0: ; %entry -; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0 @@ -415,7 +415,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 
%in) { ; ; GFX1032GISEL-LABEL: divergent_value: ; GFX1032GISEL: ; %bb.0: ; %entry -; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s2, 0 ; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -434,15 +434,17 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1164DAGISEL-LABEL: divergent_value: ; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0 ; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: s_max_u32 s4, s4, s6 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -456,14 +458,16 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1164GISEL-LABEL: divergent_value: ; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_mov_b32 s4, 0 ; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: s_max_u32 s4, s4, s6 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -478,15 +482,16 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1132DAGISEL-LABEL: divergent_value: ; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0 ; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: s_max_u32 s2, s2, s5 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -500,14 +505,16 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1132GISEL-LABEL: divergent_value: ; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s2, 0 ; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132GISEL-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: s_max_u32 s2, s2, s5 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -530,17 +537,17 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL: ; %bb.0: ; %entry ; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 ; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX8DAGISEL-NEXT: ; %bb.1: ; %else -; GFX8DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX8DAGISEL-NEXT: ; %bb.3: ; %if ; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec @@ -555,8 +562,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: ; %bb.5: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: 
v_mov_b32_e32 v3, s1 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -567,16 +574,16 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL: ; %bb.0: ; %entry ; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX8GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else -; GFX8GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b32 s6, s4 ; GFX8GISEL-NEXT: .LBB4_2: ; %Flow -; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -589,8 +596,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX8GISEL-NEXT: .LBB4_5: ; %endif -; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -602,17 +609,17 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL: ; %bb.0: ; %entry ; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 ; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; 
GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX9DAGISEL-NEXT: ; %bb.1: ; %else -; GFX9DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if ; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec @@ -627,8 +634,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: ; %bb.5: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] @@ -638,16 +645,16 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL: ; %bb.0: ; %entry ; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX9GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else -; GFX9GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b32 s6, s4 ; GFX9GISEL-NEXT: .LBB4_2: ; %Flow -; GFX9GISEL-NEXT: 
s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -660,8 +667,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX9GISEL-NEXT: .LBB4_5: ; %endif -; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -672,17 +679,17 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL: ; %bb.0: ; %entry ; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 ; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1064DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec @@ -697,8 +704,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 
%in) { ; GFX1064DAGISEL-NEXT: ; %bb.5: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] @@ -708,16 +715,16 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL: ; %bb.0: ; %entry ; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else -; GFX1064GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b32 s6, s4 ; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -730,8 +737,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1064GISEL-NEXT: .LBB4_5: ; %endif -; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -741,34 +748,34 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-LABEL: divergent_cfg: ; GFX1032DAGISEL: ; %bb.0: ; %entry ; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 -; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr3 -; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1032DAGISEL-NEXT: s_load_dword s3, s[0:1], 0x2c +; GFX1032DAGISEL-NEXT: s_load_dword s1, s[2:3], 0x2c ; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, s2 +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1032DAGISEL-NEXT: s_mov_b32 s3, 0 +; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0 ; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s5, s4 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s5 -; GFX1032DAGISEL-NEXT: s_max_u32 s3, s3, s6 +; GFX1032DAGISEL-NEXT: s_max_u32 s1, s1, s6 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1032DAGISEL-NEXT: ; %bb.5: -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif -; 
GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] @@ -777,52 +784,54 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-LABEL: divergent_cfg: ; GFX1032GISEL: ; %bb.0: ; %entry ; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 -; GFX1032GISEL-NEXT: ; implicit-def: $sgpr2 -; GFX1032GISEL-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else -; GFX1032GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1032GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: s_mov_b32 s2, s2 +; GFX1032GISEL-NEXT: s_mov_b32 s0, s0 ; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s3, s3 +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 ; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1032GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1032GISEL-NEXT: s_mov_b32 s0, 0 ; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s5, s4 ; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s5 -; GFX1032GISEL-NEXT: s_max_u32 s2, s2, s6 +; GFX1032GISEL-NEXT: s_max_u32 s0, s0, s6 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1032GISEL-NEXT: .LBB4_5: ; %endif -; GFX1032GISEL-NEXT: 
s_or_b32 exec_lo, exec_lo, s3 -; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX1032GISEL-NEXT: s_endpgm ; ; GFX1164DAGISEL-LABEL: divergent_cfg: ; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec ; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX1164DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c ; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec @@ -838,8 +847,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: ; %bb.5: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164DAGISEL-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -849,18 +858,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; ; GFX1164GISEL-LABEL: divergent_cfg: ; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec ; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 -; GFX1164GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else -; GFX1164GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c ; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b32 s6, s4 ; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[2:3] +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] ; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -874,8 +885,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1164GISEL-NEXT: .LBB4_5: ; %endif -; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -886,36 
+897,38 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; ; GFX1132DAGISEL-LABEL: divergent_cfg: ; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr3 +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX1132DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1132DAGISEL-NEXT: s_load_b32 s3, s[0:1], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[2:3], 0x2c ; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, s2 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1132DAGISEL-NEXT: s_mov_b32 s3, 0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 ; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s5, s4 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s4, s5 -; GFX1132DAGISEL-NEXT: s_max_u32 s3, s3, s6 +; GFX1132DAGISEL-NEXT: s_max_u32 s1, s1, s6 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1132DAGISEL-NEXT: ; %bb.5: -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s3 +; 
GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -925,36 +938,38 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; ; GFX1132GISEL-LABEL: divergent_cfg: ; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo -; GFX1132GISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 -; GFX1132GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else -; GFX1132GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1132GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c ; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: s_mov_b32 s2, s2 +; GFX1132GISEL-NEXT: s_mov_b32 s0, s0 ; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s3, s3 +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 ; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1132GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1132GISEL-NEXT: s_mov_b32 s0, 0 ; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s5, s4 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; 
GFX1132GISEL-NEXT: s_bitset0_b32 s4, s5 -; GFX1132GISEL-NEXT: s_max_u32 s2, s2, s6 +; GFX1132GISEL-NEXT: s_max_u32 s0, s0, s6 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1132GISEL-NEXT: .LBB4_5: ; %endif -; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX1132GISEL-NEXT: s_nop 0 ; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll index bfdb2da6dc6a4..5304188e02f84 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll @@ -19,21 +19,21 @@ declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: uniform_value: ; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8DAGISEL-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s2, s[2:3], 0x2c ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8DAGISEL-NEXT: s_endpgm ; ; GFX8GISEL-LABEL: uniform_value: ; GFX8GISEL: ; %bb.0: ; %entry 
-; GFX8GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -41,54 +41,54 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX9DAGISEL-LABEL: uniform_value: ; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9DAGISEL-NEXT: s_endpgm ; ; GFX9GISEL-LABEL: uniform_value: ; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9GISEL-NEXT: s_endpgm ; ; GFX10DAGISEL-LABEL: uniform_value: ; GFX10DAGISEL: ; %bb.0: ; %entry ; GFX10DAGISEL-NEXT: s_clause 0x1 -; GFX10DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10DAGISEL-NEXT: s_endpgm ; ; GFX10GISEL-LABEL: uniform_value: ; GFX10GISEL: ; %bb.0: ; %entry ; GFX10GISEL-NEXT: s_clause 0x1 -; GFX10GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10GISEL-NEXT: s_endpgm ; ; GFX1164DAGISEL-LABEL: uniform_value: ; GFX1164DAGISEL: ; %bb.0: ; %entry ; GFX1164DAGISEL-NEXT: s_clause 0x1 -; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 ; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1164DAGISEL-NEXT: s_nop 0 ; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -97,11 +97,11 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-LABEL: uniform_value: ; GFX1164GISEL: ; %bb.0: ; %entry ; GFX1164GISEL-NEXT: s_clause 0x1 -; GFX1164GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1164GISEL-NEXT: s_nop 0 ; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -110,10 +110,10 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-LABEL: uniform_value: ; GFX1132DAGISEL: ; %bb.0: ; %entry ; GFX1132DAGISEL-NEXT: s_clause 0x1 -; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1132DAGISEL-NEXT: s_nop 0 ; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -122,10 +122,10 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-LABEL: uniform_value: ; GFX1132GISEL: ; %bb.0: ; %entry ; GFX1132GISEL-NEXT: s_clause 0x1 -; GFX1132GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1132GISEL-NEXT: s_nop 0 ; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -139,7 +139,7 @@ entry: define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; GFX8DAGISEL-LABEL: const_value: ; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) 
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -149,7 +149,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX8GISEL-LABEL: const_value: ; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -159,7 +159,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX9DAGISEL-LABEL: const_value: ; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -168,7 +168,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX9GISEL-LABEL: const_value: ; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -177,7 +177,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX10DAGISEL-LABEL: const_value: ; GFX10DAGISEL: ; %bb.0: ; %entry -; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -186,7 +186,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX10GISEL-LABEL: const_value: ; GFX10GISEL: ; %bb.0: ; %entry -; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -195,7 +195,7 @@ define 
amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1164DAGISEL-LABEL: const_value: ; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -206,7 +206,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1164GISEL-LABEL: const_value: ; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -217,7 +217,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1132DAGISEL-LABEL: const_value: ; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -227,7 +227,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1132GISEL-LABEL: const_value: ; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] @@ -281,7 +281,7 @@ entry: define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8DAGISEL-LABEL: divergent_value: ; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s4, -1 ; 
GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -301,7 +301,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX8GISEL-LABEL: divergent_value: ; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_mov_b32 s4, -1 ; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -321,7 +321,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX9DAGISEL-LABEL: divergent_value: ; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_mov_b32 s4, -1 @@ -340,7 +340,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX9GISEL-LABEL: divergent_value: ; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_mov_b32 s4, -1 ; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -359,7 +359,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1064DAGISEL-LABEL: divergent_value: ; GFX1064DAGISEL: ; %bb.0: ; %entry -; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s4, -1 @@ -378,7 +378,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1064GISEL-LABEL: divergent_value: ; GFX1064GISEL: ; %bb.0: ; %entry -; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064GISEL-NEXT: s_mov_b64 
s[2:3], exec ; GFX1064GISEL-NEXT: s_mov_b32 s4, -1 ; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -397,7 +397,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1032DAGISEL-LABEL: divergent_value: ; GFX1032DAGISEL: ; %bb.0: ; %entry -; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, -1 @@ -416,7 +416,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1032GISEL-LABEL: divergent_value: ; GFX1032GISEL: ; %bb.0: ; %entry -; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s2, -1 ; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -435,15 +435,17 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1164DAGISEL-LABEL: divergent_value: ; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_mov_b32 s4, -1 ; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: s_min_u32 s4, s4, s6 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164DAGISEL-NEXT: s_cbranch_scc1 
.LBB3_1 @@ -457,14 +459,16 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1164GISEL-LABEL: divergent_value: ; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_mov_b32 s4, -1 ; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: s_min_u32 s4, s4, s6 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -479,15 +483,16 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1132DAGISEL-LABEL: divergent_value: ; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, -1 ; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: s_min_u32 s2, s2, s5 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 ; 
GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -501,14 +506,16 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1132GISEL-LABEL: divergent_value: ; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s2, -1 ; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: s_min_u32 s2, s2, s5 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -531,17 +538,17 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL: ; %bb.0: ; %entry ; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 ; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX8DAGISEL-NEXT: ; %bb.1: ; %else -; GFX8DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] ; 
GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX8DAGISEL-NEXT: ; %bb.3: ; %if ; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec @@ -556,8 +563,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: ; %bb.5: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -568,16 +575,16 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL: ; %bb.0: ; %entry ; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX8GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else -; GFX8GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b32 s6, s4 ; GFX8GISEL-NEXT: .LBB4_2: ; %Flow -; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -590,8 +597,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX8GISEL-NEXT: .LBB4_5: ; %endif -; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; 
GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -603,17 +610,17 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL: ; %bb.0: ; %entry ; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 ; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX9DAGISEL-NEXT: ; %bb.1: ; %else -; GFX9DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if ; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec @@ -628,8 +635,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: ; %bb.5: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] @@ -639,16 +646,16 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL: ; %bb.0: ; %entry ; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 
; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX9GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else -; GFX9GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b32 s6, s4 ; GFX9GISEL-NEXT: .LBB4_2: ; %Flow -; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -661,8 +668,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX9GISEL-NEXT: .LBB4_5: ; %endif -; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -673,17 +680,17 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL: ; %bb.0: ; %entry ; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 ; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1064DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX1064DAGISEL-NEXT: ; 
implicit-def: $vgpr0 ; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec @@ -698,8 +705,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: ; %bb.5: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] @@ -709,16 +716,16 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL: ; %bb.0: ; %entry ; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else -; GFX1064GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b32 s6, s4 ; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; 
GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -731,8 +738,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1064GISEL-NEXT: .LBB4_5: ; %endif -; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -742,34 +749,34 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-LABEL: divergent_cfg: ; GFX1032DAGISEL: ; %bb.0: ; %entry ; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 -; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr3 -; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1032DAGISEL-NEXT: s_load_dword s3, s[0:1], 0x2c +; GFX1032DAGISEL-NEXT: s_load_dword s1, s[2:3], 0x2c ; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, s2 +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1032DAGISEL-NEXT: 
s_mov_b32 s3, -1 +; GFX1032DAGISEL-NEXT: s_mov_b32 s1, -1 ; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s5, s4 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s5 -; GFX1032DAGISEL-NEXT: s_min_u32 s3, s3, s6 +; GFX1032DAGISEL-NEXT: s_min_u32 s1, s1, s6 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1032DAGISEL-NEXT: ; %bb.5: -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] @@ -778,52 +785,54 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-LABEL: divergent_cfg: ; GFX1032GISEL: ; %bb.0: ; %entry ; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 -; GFX1032GISEL-NEXT: ; implicit-def: $sgpr2 -; GFX1032GISEL-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else -; GFX1032GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1032GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: s_mov_b32 s2, s2 +; GFX1032GISEL-NEXT: s_mov_b32 s0, s0 ; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s3, s3 +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 ; GFX1032GISEL-NEXT: 
s_cbranch_execz .LBB4_5 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1032GISEL-NEXT: s_mov_b32 s2, -1 +; GFX1032GISEL-NEXT: s_mov_b32 s0, -1 ; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s5, s4 ; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s5 -; GFX1032GISEL-NEXT: s_min_u32 s2, s2, s6 +; GFX1032GISEL-NEXT: s_min_u32 s0, s0, s6 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1032GISEL-NEXT: .LBB4_5: ; %endif -; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX1032GISEL-NEXT: s_endpgm ; ; GFX1164DAGISEL-LABEL: divergent_cfg: ; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec ; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX1164DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c ; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; 
GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec @@ -839,8 +848,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: ; %bb.5: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -850,18 +859,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; ; GFX1164GISEL-LABEL: divergent_cfg: ; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec ; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 -; GFX1164GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else -; GFX1164GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c ; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b32 s6, s4 ; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[2:3] +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] ; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1164GISEL-NEXT: ; %bb.3: ; 
%if ; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -875,8 +886,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1164GISEL-NEXT: .LBB4_5: ; %endif -; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -887,36 +898,38 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; ; GFX1132DAGISEL-LABEL: divergent_cfg: ; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr3 +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX1132DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1132DAGISEL-NEXT: s_load_b32 s3, s[0:1], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[2:3], 0x2c ; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, s2 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1132DAGISEL-NEXT: 
s_mov_b32 s3, -1 +; GFX1132DAGISEL-NEXT: s_mov_b32 s1, -1 ; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s5, s4 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s4, s5 -; GFX1132DAGISEL-NEXT: s_min_u32 s3, s3, s6 +; GFX1132DAGISEL-NEXT: s_min_u32 s1, s1, s6 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1132DAGISEL-NEXT: ; %bb.5: -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -926,36 +939,38 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; ; GFX1132GISEL-LABEL: divergent_cfg: ; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo -; GFX1132GISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 -; GFX1132GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else -; GFX1132GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1132GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c ; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: s_mov_b32 s2, s2 +; GFX1132GISEL-NEXT: s_mov_b32 
s0, s0 ; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s3, s3 +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 ; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1132GISEL-NEXT: s_mov_b32 s2, -1 +; GFX1132GISEL-NEXT: s_mov_b32 s0, -1 ; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s5, s4 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1132GISEL-NEXT: s_bitset0_b32 s4, s5 -; GFX1132GISEL-NEXT: s_min_u32 s2, s2, s6 +; GFX1132GISEL-NEXT: s_min_u32 s0, s0, s6 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1132GISEL-NEXT: .LBB4_5: ; %endif -; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX1132GISEL-NEXT: s_nop 0 ; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll index 47c021769aa56..d521a6c25e462 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -10,102 +10,104 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; VARIANT0-LABEL: test_barrier: ; VARIANT0: ; %bb.0: ; %entry -; VARIANT0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; VARIANT0-NEXT: s_load_dword s0, s[0:1], 
0xb -; VARIANT0-NEXT: s_mov_b32 s7, 0xf000 -; VARIANT0-NEXT: s_mov_b32 s6, 0 +; VARIANT0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; VARIANT0-NEXT: s_load_dword s4, s[2:3], 0xb +; VARIANT0-NEXT: s_mov_b32 s3, 0xf000 +; VARIANT0-NEXT: s_mov_b32 s2, 0 ; VARIANT0-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT0-NEXT: v_mov_b32_e32 v2, 0 ; VARIANT0-NEXT: v_not_b32_e32 v3, v0 ; VARIANT0-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; VARIANT0-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VARIANT0-NEXT: s_barrier -; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s0, v3 +; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s4, v3 ; VARIANT0-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; VARIANT0-NEXT: v_lshl_b64 v[3:4], v[3:4], 2 -; VARIANT0-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 +; VARIANT0-NEXT: buffer_load_dword v0, v[3:4], s[0:3], 0 addr64 ; VARIANT0-NEXT: s_waitcnt vmcnt(0) -; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; VARIANT0-NEXT: s_endpgm ; ; VARIANT1-LABEL: test_barrier: ; VARIANT1: ; %bb.0: ; %entry -; VARIANT1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; VARIANT1-NEXT: s_load_dword s0, s[0:1], 0xb -; VARIANT1-NEXT: s_mov_b32 s7, 0xf000 -; VARIANT1-NEXT: s_mov_b32 s6, 0 +; VARIANT1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; VARIANT1-NEXT: s_load_dword s4, s[2:3], 0xb +; VARIANT1-NEXT: s_mov_b32 s3, 0xf000 +; VARIANT1-NEXT: s_mov_b32 s2, 0 ; VARIANT1-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT1-NEXT: v_mov_b32_e32 v2, 0 ; VARIANT1-NEXT: v_not_b32_e32 v3, v0 ; VARIANT1-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; VARIANT1-NEXT: s_barrier -; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s0, v3 +; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s4, v3 ; VARIANT1-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; 
VARIANT1-NEXT: v_lshl_b64 v[3:4], v[3:4], 2 ; VARIANT1-NEXT: s_waitcnt expcnt(0) -; VARIANT1-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 +; VARIANT1-NEXT: buffer_load_dword v0, v[3:4], s[0:3], 0 addr64 ; VARIANT1-NEXT: s_waitcnt vmcnt(0) -; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; VARIANT1-NEXT: s_endpgm ; ; VARIANT2-LABEL: test_barrier: ; VARIANT2: ; %bb.0: ; %entry -; VARIANT2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VARIANT2-NEXT: s_load_dword s4, s[0:1], 0x2c +; VARIANT2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VARIANT2-NEXT: s_load_dword s4, s[2:3], 0x2c ; VARIANT2-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VARIANT2-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT2-NEXT: global_store_dword v2, v0, s[2:3] +; VARIANT2-NEXT: global_store_dword v2, v0, s[0:1] ; VARIANT2-NEXT: v_xad_u32 v0, v0, -1, s4 ; VARIANT2-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VARIANT2-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; VARIANT2-NEXT: v_mov_b32_e32 v3, s3 -; VARIANT2-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; VARIANT2-NEXT: v_mov_b32_e32 v3, s1 +; VARIANT2-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; VARIANT2-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc ; VARIANT2-NEXT: s_waitcnt vmcnt(0) ; VARIANT2-NEXT: s_barrier ; VARIANT2-NEXT: global_load_dword v0, v[0:1], off ; VARIANT2-NEXT: s_waitcnt vmcnt(0) -; VARIANT2-NEXT: global_store_dword v2, v0, s[2:3] +; VARIANT2-NEXT: global_store_dword v2, v0, s[0:1] ; VARIANT2-NEXT: s_endpgm ; ; VARIANT3-LABEL: test_barrier: ; VARIANT3: ; %bb.0: ; %entry -; VARIANT3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VARIANT3-NEXT: s_load_dword s4, s[0:1], 0x2c +; VARIANT3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VARIANT3-NEXT: s_load_dword s4, s[2:3], 0x2c ; VARIANT3-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VARIANT3-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT3-NEXT: global_store_dword v2, v0, s[2:3] +; VARIANT3-NEXT: global_store_dword v2, v0, s[0:1] ; VARIANT3-NEXT: v_xad_u32 v0, 
v0, -1, s4 ; VARIANT3-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VARIANT3-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; VARIANT3-NEXT: v_mov_b32_e32 v3, s3 -; VARIANT3-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; VARIANT3-NEXT: v_mov_b32_e32 v3, s1 +; VARIANT3-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; VARIANT3-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc ; VARIANT3-NEXT: s_barrier ; VARIANT3-NEXT: global_load_dword v0, v[0:1], off ; VARIANT3-NEXT: s_waitcnt vmcnt(0) -; VARIANT3-NEXT: global_store_dword v2, v0, s[2:3] +; VARIANT3-NEXT: global_store_dword v2, v0, s[0:1] ; VARIANT3-NEXT: s_endpgm ; ; VARIANT4-LABEL: test_barrier: ; VARIANT4: ; %bb.0: ; %entry -; VARIANT4-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; VARIANT4-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; VARIANT4-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; VARIANT4-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; VARIANT4-NEXT: s_delay_alu instid0(VALU_DEP_1) +; VARIANT4-NEXT: v_lshlrev_b32_e32 v3, 2, v2 ; VARIANT4-NEXT: s_wait_kmcnt 0x0 -; VARIANT4-NEXT: v_xad_u32 v1, v0, -1, s2 -; VARIANT4-NEXT: global_store_b32 v3, v0, s[0:1] +; VARIANT4-NEXT: v_xad_u32 v0, v2, -1, s2 +; VARIANT4-NEXT: global_store_b32 v3, v2, s[0:1] ; VARIANT4-NEXT: s_wait_storecnt 0x0 ; VARIANT4-NEXT: s_barrier_signal -1 ; VARIANT4-NEXT: s_barrier_wait -1 -; VARIANT4-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; VARIANT4-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VARIANT4-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; VARIANT4-NEXT: v_lshlrev_b64_e32 v[1:2], 2, v[1:2] -; VARIANT4-NEXT: v_add_co_u32 v1, vcc_lo, s0, v1 +; VARIANT4-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] +; VARIANT4-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 ; VARIANT4-NEXT: s_delay_alu instid0(VALU_DEP_2) -; VARIANT4-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s1, v2, vcc_lo -; VARIANT4-NEXT: global_load_b32 v0, v[1:2], off +; VARIANT4-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; VARIANT4-NEXT: global_load_b32 v0, v[0:1], off ; VARIANT4-NEXT: s_wait_loadcnt 0x0 ; VARIANT4-NEXT: 
global_store_b32 v3, v0, s[0:1] ; VARIANT4-NEXT: s_nop 0 @@ -114,20 +116,22 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; ; VARIANT5-LABEL: test_barrier: ; VARIANT5: ; %bb.0: ; %entry -; VARIANT5-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; VARIANT5-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; VARIANT5-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; VARIANT5-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; VARIANT5-NEXT: s_delay_alu instid0(VALU_DEP_1) +; VARIANT5-NEXT: v_lshlrev_b32_e32 v3, 2, v2 ; VARIANT5-NEXT: s_wait_kmcnt 0x0 -; VARIANT5-NEXT: v_xad_u32 v1, v0, -1, s2 -; VARIANT5-NEXT: global_store_b32 v3, v0, s[0:1] +; VARIANT5-NEXT: v_xad_u32 v0, v2, -1, s2 +; VARIANT5-NEXT: global_store_b32 v3, v2, s[0:1] ; VARIANT5-NEXT: s_barrier_signal -1 ; VARIANT5-NEXT: s_barrier_wait -1 -; VARIANT5-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; VARIANT5-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VARIANT5-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; VARIANT5-NEXT: v_lshlrev_b64_e32 v[1:2], 2, v[1:2] -; VARIANT5-NEXT: v_add_co_u32 v1, vcc_lo, s0, v1 +; VARIANT5-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] +; VARIANT5-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 ; VARIANT5-NEXT: s_delay_alu instid0(VALU_DEP_2) -; VARIANT5-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s1, v2, vcc_lo -; VARIANT5-NEXT: global_load_b32 v0, v[1:2], off +; VARIANT5-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; VARIANT5-NEXT: global_load_b32 v0, v[0:1], off ; VARIANT5-NEXT: s_wait_loadcnt 0x0 ; VARIANT5-NEXT: global_store_b32 v3, v0, s[0:1] ; VARIANT5-NEXT: s_nop 0 @@ -136,23 +140,24 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; ; VARIANT6-LABEL: test_barrier: ; VARIANT6: ; %bb.0: ; %entry -; VARIANT6-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; VARIANT6-NEXT: v_lshlrev_b32_e32 v5, 2, v0 +; VARIANT6-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; VARIANT6-NEXT: s_wait_kmcnt 0x0 +; VARIANT6-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_and_b32 v4, 0x3ff, v0 ; 
VARIANT6-NEXT: s_sub_co_i32 s2, s2, 1 -; VARIANT6-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 -; VARIANT6-NEXT: v_sub_nc_u32_e32 v1, s2, v0 -; VARIANT6-NEXT: global_store_b32 v5, v0, s[0:1] +; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_1) +; VARIANT6-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_lshlrev_b32 v5, 2, v4 +; VARIANT6-NEXT: v_sub_nc_u32_e32 v0, s2, v4 +; VARIANT6-NEXT: global_store_b32 v5, v4, s[0:1] +; VARIANT6-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VARIANT6-NEXT: s_wait_storecnt 0x0 ; VARIANT6-NEXT: s_barrier_signal -1 ; VARIANT6-NEXT: s_barrier_wait -1 -; VARIANT6-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; VARIANT6-NEXT: v_lshlrev_b64_e32 v[1:2], 2, v[1:2] -; VARIANT6-NEXT: v_add_co_u32 v1, vcc_lo, v3, v1 +; VARIANT6-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] +; VARIANT6-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 ; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_2) -; VARIANT6-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo -; VARIANT6-NEXT: global_load_b32 v0, v[1:2], off +; VARIANT6-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; VARIANT6-NEXT: global_load_b32 v0, v[0:1], off ; VARIANT6-NEXT: s_wait_loadcnt 0x0 ; VARIANT6-NEXT: global_store_b32 v5, v0, s[0:1] ; VARIANT6-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll index 38a34ec6daf73..8bfe996c6a90a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll @@ -5,10 +5,11 @@ define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_signal: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] @@ -22,10 +23,11 @@ define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_signal: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -51,10 +53,11 @@ entry: define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test2_s_barrier_signal: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] @@ -68,10 +71,11 @@ define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test2_s_barrier_signal: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; 
GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -97,10 +101,11 @@ entry: define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test3_s_barrier_signal: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] @@ -114,10 +119,11 @@ define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test3_s_barrier_signal: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: 
global_store_b32 v3, v2, s[0:1] @@ -143,12 +149,12 @@ entry: define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_signal_var: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GCN-NEXT: s_mov_b32 m0, 1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 -; GCN-NEXT: s_mov_b32 m0, 1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v1, s[0:1] @@ -162,11 +168,13 @@ define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0 ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_var: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v2, 0 ; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -222,8 +230,10 @@ define void @test2_s_barrier_signal_var(i32 %arg) { define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_signal_isfirst: ; GCN: ; %bb.0: ; %entry -; 
GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_wait_storecnt 0x0 @@ -242,8 +252,10 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_isfirst: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 @@ -278,8 +290,10 @@ entry: define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { ; GCN-LABEL: test2_s_barrier_signal_isfirst: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_wait_storecnt 0x0 @@ -298,8 +312,10 @@ define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; ; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_isfirst: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; 
GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 @@ -334,8 +350,10 @@ entry: define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { ; GCN-LABEL: test3_s_barrier_signal_isfirst: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_wait_storecnt 0x0 @@ -354,8 +372,10 @@ define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; ; GLOBAL-ISEL-LABEL: test3_s_barrier_signal_isfirst: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 @@ -390,9 +410,11 @@ entry: define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_signal_isfirst_var: 
; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GCN-NEXT: s_mov_b32 m0, 1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_wait_storecnt 0x0 @@ -411,9 +433,11 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) % ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_isfirst_var: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 @@ -518,10 +542,11 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { ; GCN-LABEL: test1_s_barrier_init: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 @@ -535,10 +560,11 @@ define amdgpu_kernel void 
@test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_init: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 @@ -562,10 +588,11 @@ entry: define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { ; GCN-LABEL: test2_s_barrier_init: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 @@ -579,10 +606,11 @@ define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC ; ; GLOBAL-ISEL-LABEL: test2_s_barrier_init: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 @@ -606,10 +634,11 @@ entry: define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { ; GCN-LABEL: test3_s_barrier_init: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 @@ -623,10 +652,11 @@ define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC ; ; GLOBAL-ISEL-LABEL: test3_s_barrier_init: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 @@ -650,15 +680,17 @@ entry: define amdgpu_kernel void @test4_s_barrier_init(ptr addrspace(1) %out, i32 %bar, i32 %mbrCnt) #0 { ; GCN-LABEL: test4_s_barrier_init: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GCN-NEXT: 
v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_lshl_b32 s3, s3, 16 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] ; GCN-NEXT: s_or_b32 s2, s2, s3 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: s_barrier_init m0 ; GCN-NEXT: global_store_b32 v3, v0, s[0:1] @@ -668,10 +700,11 @@ define amdgpu_kernel void @test4_s_barrier_init(ptr addrspace(1) %out, i32 %bar, ; ; GLOBAL-ISEL-LABEL: test4_s_barrier_init: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_lshl_b32 s3, 16, s3 @@ -732,11 +765,12 @@ define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) { define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_join: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_barrier_join -1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: s_barrier_join -1 -; 
GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -746,10 +780,11 @@ define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_join: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -772,11 +807,12 @@ entry: define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test2_s_barrier_join: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_barrier_join 1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: s_barrier_join 1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -786,10 +822,11 @@ define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test2_s_barrier_join: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; 
GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -812,11 +849,12 @@ entry: define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test3_s_barrier_join: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_barrier_join 0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: s_barrier_join 0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -826,10 +864,11 @@ define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test3_s_barrier_join: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -852,11 +891,11 @@ entry: define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %bar) #0 { ; GCN-LABEL: 
test4_s_barrier_join_m0: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_mov_b32 m0, s2 @@ -869,10 +908,11 @@ define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %b ; ; GLOBAL-ISEL-LABEL: test4_s_barrier_join_m0: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 @@ -924,8 +964,10 @@ define void @test5_s_barrier_join_m0(i32 %arg) { define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_leave: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: 
s_barrier_leave @@ -943,14 +985,16 @@ define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrsp ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_leave: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_barrier_leave ; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0 ; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] ; GLOBAL-ISEL-NEXT: s_clause 0x1 @@ -978,11 +1022,12 @@ entry: define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_wakeup_barrier: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_wakeup_barrier -1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: s_wakeup_barrier -1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -992,10 +1037,11 @@ define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test1_s_wakeup_barrier: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -1018,11 +1064,12 @@ entry: define amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test2_s_wakeup_barrier: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_wakeup_barrier 1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: s_wakeup_barrier 1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -1032,10 +1079,11 @@ define amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test2_s_wakeup_barrier: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; 
GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -1058,11 +1106,12 @@ entry: define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test3_s_wakeup_barrier: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_wakeup_barrier 0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: s_wakeup_barrier 0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -1072,10 +1121,11 @@ define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test3_s_wakeup_barrier: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -1098,11 +1148,11 @@ entry: define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32 %bar) #0 { ; GCN-LABEL: test4_s_wakeup_barrier_m0: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0 -; 
GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_mov_b32 m0, s2 @@ -1115,10 +1165,11 @@ define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32 ; ; GLOBAL-ISEL-LABEL: test4_s_wakeup_barrier_m0: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 @@ -1170,11 +1221,12 @@ define void @test5_s_wakeup_barrier_m0(i32 %arg) { define amdgpu_kernel void @test1_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_get_barrier_state: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_get_barrier_state s2, -1 -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_get_barrier_state s4, -1 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1182,13 +1234,14 @@ define amdgpu_kernel void @test1_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test1_s_get_barrier_state: ; GLOBAL-ISEL: ; 
%bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) +; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, -1 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_nop 0 @@ -1206,11 +1259,12 @@ entry: define amdgpu_kernel void @test2_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test2_s_get_barrier_state: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_get_barrier_state s2, 1 -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_get_barrier_state s4, 1 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1218,13 +1272,14 @@ define amdgpu_kernel void @test2_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test2_s_get_barrier_state: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, 
v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) +; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, 1 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_nop 0 @@ -1242,11 +1297,12 @@ entry: define amdgpu_kernel void @test3_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test3_s_get_barrier_state: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_get_barrier_state s2, 0 -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_get_barrier_state s4, 0 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1254,13 +1310,14 @@ define amdgpu_kernel void @test3_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test3_s_get_barrier_state: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) +; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: 
s_get_barrier_state s2, 0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_nop 0 @@ -1278,8 +1335,10 @@ entry: define amdgpu_kernel void @test4_s_get_barrier_state_m0(ptr addrspace(1) %out, i32 %bar) #0 { ; GCN-LABEL: test4_s_get_barrier_state_m0: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1294,8 +1353,10 @@ define amdgpu_kernel void @test4_s_get_barrier_state_m0(ptr addrspace(1) %out, i ; ; GLOBAL-ISEL-LABEL: test4_s_get_barrier_state_m0: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1352,10 +1413,11 @@ define i32 @test5_s_get_barrier_state_m0(i32 %arg) { define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test_barrier_convert: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GCN-NEXT: 
v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] @@ -1369,10 +1431,11 @@ define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test_barrier_convert: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll index 4a404af54188d..bc7052132a87b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll @@ -37,7 +37,7 @@ define void @test_s_sleep_var2() { define amdgpu_kernel void @test_s_sleep_var3(i32 %arg) { ; GCN-LABEL: test_s_sleep_var3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GCN-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_sleep_var s0 ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll index c2e74eb05d164..527627a5a2f67 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll @@ -5,10 +5,11 @@ define amdgpu_kernel 
void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 5, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GCN-NEXT: v_and_b32_e32 v40, 0x7fe0, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_add_nc_u32_e32 v32, s0, v40 ; GCN-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40 ; GCN-NEXT: ds_load_b128 v[4:7], v32 offset:16 @@ -72,10 +73,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr ad ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v40, 5, v0 +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; EXACTCUTOFF-NEXT: v_and_b32_e32 v40, 0x7fe0, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v32, s0, v40 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40 ; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v32 offset:16 @@ -175,10 +177,11 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 5, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_lshlrev_b32_e32 
v0, 5, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GCN-NEXT: v_and_b32_e32 v16, 0x7fe0, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_add_nc_u32_e32 v17, s0, v16 ; GCN-NEXT: v_add_nc_u32_e32 v16, s1, v16 ; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:16 @@ -256,10 +259,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v16, 5, v0 +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; EXACTCUTOFF-NEXT: v_and_b32_e32 v16, 0x7fe0, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v17, s0, v16 ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v16, s1, v16 ; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll index fdcb1773d0a3f..a29e2298210a3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll @@ -7,11 +7,12 @@ declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16..i16( define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 4, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; 
GCN-NEXT: v_mov_b32_e32 v48, 0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GCN-NEXT: v_and_b32_e32 v28, 0x3ff0, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v28 ; GCN-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28 ; GCN-NEXT: ds_load_b128 v[8:11], v0 @@ -58,11 +59,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v28, 4, v0 +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v48, 0 +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; EXACTCUTOFF-NEXT: v_and_b32_e32 v28, 0x3ff0, v0 ; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0 -; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2) ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v0, s0, v28 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28 ; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v0 @@ -147,127 +149,131 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v16, 0x3ff, v0 ; GCN-NEXT: v_mov_b32_e32 v18, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_lshl_add_u32 v17, v0, 5, s0 -; GCN-NEXT: v_lshl_add_u32 v0, v0, 4, s1 -; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:1024 -; GCN-NEXT: ds_load_b128 v[1:4], v17 -; GCN-NEXT: ds_load_b128 v[5:8], v17 offset:16 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_lshl_add_u32 
v17, v16, 5, s0 +; GCN-NEXT: v_lshl_add_u32 v16, v16, 4, s1 +; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:1024 +; GCN-NEXT: ds_load_b128 v[0:3], v17 +; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:16 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(3) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x2 -; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 -; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 +; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 ; GCN-NEXT: s_wait_dscnt 0x0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v0, v[13:16] -; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:2560 -; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: ds_store_b128 v16, v[12:15] +; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:2560 +; GCN-NEXT: v_mov_b32_e32 v16, s1 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 -; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 +; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:512 -; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:4608 +; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:512 +; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:4608 ; GCN-NEXT: ; sched_group_barrier 
mask(0x00000200) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 -; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 +; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:1024 -; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:7168 +; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:1024 +; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:7168 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 -; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 +; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:1536 -; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:10240 +; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:1536 +; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:10240 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 -; GCN-NEXT: v_dual_mov_b32 v14, v10 :: 
v_dual_mov_b32 v13, v9 +; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 +; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:2048 +; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:2048 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; GCN-NEXT: s_endpgm ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v16, 0x3ff, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v18, 0 ; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0 -; EXACTCUTOFF-NEXT: v_lshl_add_u32 v17, v0, 5, s0 -; EXACTCUTOFF-NEXT: v_lshl_add_u32 v0, v0, 4, s1 -; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:1024 -; EXACTCUTOFF-NEXT: ds_load_b128 v[1:4], v17 -; EXACTCUTOFF-NEXT: ds_load_b128 v[5:8], v17 offset:16 +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2) +; EXACTCUTOFF-NEXT: v_lshl_add_u32 v17, v16, 5, s0 +; EXACTCUTOFF-NEXT: v_lshl_add_u32 v16, v16, 4, s1 +; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:1024 +; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 +; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:16 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(3) SyncID(0) ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x2 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 ; EXACTCUTOFF-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] -; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:2560 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 +; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] +; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:2560 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v16, s1 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:512 -; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:4608 +; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:512 +; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:4608 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, 
v9 :: v_dual_mov_b32 v12, v8 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:1024 -; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:7168 +; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:1024 +; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:7168 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:1536 -; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:10240 +; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:1536 +; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:10240 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 
v13, v9 :: v_dual_mov_b32 v12, v8 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:2048 +; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:2048 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index 10f09b6390aba..24b8a3c2dc873 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -29,7 +29,8 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; GCN-NEXT: ; kill: killed $sgpr0_sgpr1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -96,7 +97,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE: ; EXACTCUTOFF: ; %bb.0: -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; EXACTCUTOFF-NEXT: ; kill: killed $sgpr0_sgpr1 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) @@ -178,34 +180,27 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr define amdgpu_kernel void 
@test_sched_group_barrier_pipeline_alternating_READ_VALU(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96 -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_lo_u32 v29, v29, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 ; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 -; GCN-NEXT: v_mul_lo_u32 v28, v28, v28 -; GCN-NEXT: v_mul_lo_u32 v31, v31, v31 -; GCN-NEXT: v_mul_lo_u32 v30, v30, v30 +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_lo_u32 v29, v29, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 ; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:112 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 -; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 -; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96 +; GCN-NEXT: v_mul_lo_u32 v28, v28, v28 +; GCN-NEXT: v_mul_lo_u32 v31, v31, v31 +; GCN-NEXT: v_mul_lo_u32 v30, v30, v30 ; GCN-NEXT: ; sched_group_barrier 
mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -213,24 +208,33 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_lo_u32 v7, v7, v7 ; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 +; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:80 ; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 -; GCN-NEXT: v_mul_lo_u32 v15, v15, v15 -; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:80 -; GCN-NEXT: v_mul_lo_u32 v14, v14, v14 -; GCN-NEXT: v_mul_lo_u32 v12, v12, v12 -; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64 -; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:32 +; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:48 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 +; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 +; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 +; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_lo_u32 v15, v15, v15 +; GCN-NEXT: v_mul_lo_u32 v14, v14, v14 +; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v19, v19, v19 ; GCN-NEXT: v_mul_lo_u32 v18, v18, v18 +; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64 +; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:32 ; GCN-NEXT: v_mul_lo_u32 v17, v17, v17 +; GCN-NEXT: v_mul_lo_u32 v16, v16, v16 +; GCN-NEXT: v_mul_lo_u32 v12, v12, v12 +; GCN-NEXT: ; 
sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_lo_u32 v23, v23, v23 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -241,12 +245,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v22, v22, v22 ; GCN-NEXT: v_mul_lo_u32 v21, v21, v21 ; GCN-NEXT: v_mul_lo_u32 v20, v20, v20 -; GCN-NEXT: v_mul_lo_u32 v16, v16, v16 ; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:112 ; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:96 -; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:80 +; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:80 ; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:64 -; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 +; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:48 ; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:32 ; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:16 ; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] @@ -258,34 +261,27 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU: ; EXACTCUTOFF: ; %bb.0: -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:16 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96 -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v29, v29, v29 -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 ; 
EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v28, v28, v28 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v31, v31, v31 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v30, v30, v30 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v29, v29, v29 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:112 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v28, v28, v28 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v31, v31, v31 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v30, v30, v30 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -293,24 +289,33 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v7, v7, v7 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:80 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 ; EXACTCUTOFF-NEXT: 
v_mul_lo_u32 v4, v4, v4 -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:80 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v14, v14, v14 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:48 +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(2) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(2) +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v14, v14, v14 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v19, v19, v19 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v18, v18, v18 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:32 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v17, v17, v17 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v16, v16, v16 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v23, v23, v23 ; EXACTCUTOFF-NEXT: 
s_waitcnt vmcnt(0) @@ -321,12 +326,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v22, v22, v22 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v21, v21, v21 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v20, v20, v20 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v16, v16, v16 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:112 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:96 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:80 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:80 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:64 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:48 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:32 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:16 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] @@ -381,23 +385,23 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 7, v0 ; GCN-NEXT: ; kill: killed $sgpr0_sgpr1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32 -; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: s_waitcnt 
vmcnt(0) -; GCN-NEXT: v_mul_lo_u32 v7, v7, v7 -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 +; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 ; GCN-NEXT: v_mul_lo_u32 v12, v12, v12 ; GCN-NEXT: v_mul_lo_u32 v15, v15, v15 ; GCN-NEXT: v_mul_lo_u32 v14, v14, v14 -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] +; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 @@ -405,6 +409,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 ; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] ; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112 +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 @@ -418,10 +423,17 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 ; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96 -; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 -; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 -; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64 +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:80 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 +; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 +; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 +; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:80 +; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 +; GCN-NEXT: ; sched_group_barrier 
mask(0x00000040) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -430,8 +442,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 ; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:64 -; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 ; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) @@ -439,53 +450,47 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 ; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 ; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 ; 
GCN-NEXT: v_mul_lo_u32 v10, v10, v10 ; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:80 +; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:64 +; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 ; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 ; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 ; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:80 -; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:64 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: s_endpgm ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE: ; EXACTCUTOFF: ; %bb.0: -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v16, 7, v0 ; EXACTCUTOFF-NEXT: ; kill: killed $sgpr0_sgpr1 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 +; EXACTCUTOFF-NEXT: 
; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v7, v7, v7 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v14, v14, v14 -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 @@ -493,6 +498,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 @@ -506,10 +512,17 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:80 +; 
EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:80 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -518,8 +531,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:64 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) @@ -527,31 +539,25 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; 
EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:80 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:64 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:80 -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:64 ; EXACTCUTOFF-NEXT: ; 
sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #2 @@ -614,8 +620,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v1, s0, v0 ; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 @@ -720,8 +727,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 ; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:112 @@ -862,8 +870,9 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -995,8 +1004,9 @@ define amdgpu_kernel void 
@test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v1, 0x1ff80, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 2.0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) @@ -1188,9 +1198,9 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out, <5 x float> %in1) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_interleave_EXP_MFMA: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x44 ; GCN-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v7, 0x32a5705f ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1202,7 +1212,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: v_add_f32_e32 v4, v6, v4 ; GCN-NEXT: v_exp_f32_e32 v4, v4 ; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GCN-NEXT: v_add_u32_e32 v1, s2, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 +; GCN-NEXT: v_add_u32_e32 v1, s0, v0 ; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 ; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 ; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 @@ -1277,7 +1288,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: v_mul_f32_e32 v4, s7, v3 ; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GCN-NEXT: v_rndne_f32_e32 v10, v4 -; GCN-NEXT: s_load_dword s8, s[0:1], 0x54 +; GCN-NEXT: s_load_dword s8, s[2:3], 0x54 ; GCN-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v9, v1, a[64:95] ; GCN-NEXT: v_sub_f32_e32 v1, v4, v10 @@ -1313,7 +1324,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s8, v6 ; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GCN-NEXT: v_add_u32_e32 v0, s3, v0 +; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; GCN-NEXT: s_waitcnt lgkmcnt(1) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v9, v1, a[128:159] @@ -1324,8 +1335,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[0:3] -; GCN-NEXT: v_mov_b32_e32 v0, s3 -; GCN-NEXT: ; kill: killed $sgpr0_sgpr1 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: ; kill: killed $sgpr2_sgpr3 ; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) @@ -1372,9 +1383,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_interleave_EXP_MFMA: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x44 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b -; EXACTCUTOFF-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v7, 0x32a5705f ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) @@ -1386,7 +1397,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: v_add_f32_e32 v4, v6, v4 ; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v4 ; EXACTCUTOFF-NEXT: 
v_cvt_i32_f32_e32 v5, v5 -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s2, v0 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 ; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:112 ; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:96 ; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:80 @@ -1461,7 +1473,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s7, v3 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v10, v4 -; EXACTCUTOFF-NEXT: s_load_dword s8, s[0:1], 0x54 +; EXACTCUTOFF-NEXT: s_load_dword s8, s[2:3], 0x54 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v9, v1, a[64:95] ; EXACTCUTOFF-NEXT: v_sub_f32_e32 v1, v4, v10 @@ -1497,7 +1509,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s8, v6 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s3, v0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v9, v1, a[128:159] @@ -1508,8 +1520,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s3 -; EXACTCUTOFF-NEXT: ; kill: killed $sgpr0_sgpr1 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 +; EXACTCUTOFF-NEXT: ; kill: killed $sgpr2_sgpr3 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; 
sched_group_barrier mask(0x00000400) size(1) SyncID(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll index eb30484ea7f19..363c54d4abe90 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) { ; GFX11-SDAG-LABEL: test_get_doorbell: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DOORBELL) ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -16,7 +16,7 @@ define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) { ; ; GFX11-GISEL-LABEL: test_get_doorbell: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DOORBELL) ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -32,7 +32,7 @@ define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) { define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) { ; GFX11-SDAG-LABEL: test_get_ddid: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DDID) ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -43,7 +43,7 @@ define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) { ; ; GFX11-GISEL-LABEL: test_get_ddid: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, 
sendmsg(MSG_RTN_GET_DDID) ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -59,7 +59,7 @@ define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) { define amdgpu_kernel void @test_get_tma(ptr addrspace(1) %out) { ; GFX11-LABEL: test_get_tma: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TMA) ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -76,7 +76,7 @@ define amdgpu_kernel void @test_get_tma(ptr addrspace(1) %out) { define amdgpu_kernel void @test_get_realtime(ptr addrspace(1) %out) { ; GFX11-LABEL: test_get_realtime: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_REALTIME) ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -93,7 +93,7 @@ define amdgpu_kernel void @test_get_realtime(ptr addrspace(1) %out) { define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) { ; GFX11-SDAG-LABEL: test_savewave: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_SAVE_WAVE) ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -104,7 +104,7 @@ define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) { ; ; GFX11-GISEL-LABEL: test_savewave: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_SAVE_WAVE) ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -120,7 +120,7 @@ define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) { 
define amdgpu_kernel void @test_get_tba(ptr addrspace(1) %out) { ; GFX11-LABEL: test_get_tba: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TBA) ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -137,7 +137,7 @@ define amdgpu_kernel void @test_get_tba(ptr addrspace(1) %out) { define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) { ; GFX11-SDAG-LABEL: test_get_0_i32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(0, 0, 0) ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -148,7 +148,7 @@ define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) { ; ; GFX11-GISEL-LABEL: test_get_0_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(0, 0, 0) ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -164,7 +164,7 @@ define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) { define amdgpu_kernel void @test_get_99999_i64(ptr addrspace(1) %out) { ; GFX11-LABEL: test_get_99999_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], 99999 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll index fc33206845a71..114d2d099ab7b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll @@ -5,8 +5,8 @@ define amdgpu_kernel void 
@set_inactive(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: set_inactive: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -24,7 +24,7 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { ; GCN-LABEL: set_inactive_imm_poison: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, 1 @@ -39,7 +39,7 @@ define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { ; GCN-LABEL: set_inactive_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -61,7 +61,7 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { ; GCN-LABEL: set_inactive_imm_poison_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 @@ -77,17 +77,17 @@ define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x i32> inreg %desc) { ; GCN-LABEL: set_inactive_scc: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x34 +; GCN-NEXT: s_load_dword s8, s[2:3], 0x2c ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_buffer_load_dword s3, s[4:7], 0x0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s3, 56 +; GCN-NEXT: s_cmp_lg_u32 s4, 56 ; GCN-NEXT: s_mov_b64 s[2:3], -1 ; GCN-NEXT: s_cbranch_scc1 .LBB4_3 ; GCN-NEXT: ; %bb.1: ; %Flow @@ -127,8 +127,8 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { ; GCN-LABEL: set_inactive_f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s5, 0x40400000 @@ -147,7 +147,7 @@ define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { ; GCN-LABEL: set_inactive_f64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -171,8 +171,8 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) { ; GCN-LABEL: set_inactive_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; 
GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s5, 0x10001 @@ -191,8 +191,8 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> % define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GCN-LABEL: set_inactive_v2f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s5, 0x3c003c00 @@ -211,7 +211,7 @@ define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) { ; GCN-LABEL: set_inactive_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s8, 1 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -235,7 +235,7 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> % define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GCN-LABEL: set_inactive_v2f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s8, 1.0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -259,8 +259,8 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) { ; GCN-LABEL: set_inactive_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s5, 0x3f803f80 @@ -279,7 
+279,7 @@ define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloa define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) { ; GCN-LABEL: set_inactive_v4i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s8, 0x10001 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -303,7 +303,7 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> % define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; GCN-LABEL: set_inactive_v4f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s8, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -327,7 +327,7 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) { ; GCN-LABEL: set_inactive_v4bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s8, 0x3f803f80 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -351,7 +351,7 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { ; GCN-LABEL: set_inactive_p0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -373,8 +373,8 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) { ; GCN-LABEL: set_inactive_p2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c -; GCN-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -392,8 +392,8 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) { ; GCN-LABEL: set_inactive_p3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -411,8 +411,8 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) { ; GCN-LABEL: set_inactive_p5: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -430,8 +430,8 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) { ; GCN-LABEL: set_inactive_p6: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll index 5401de0b08288..c1f1782ea5a87 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll @@ -5,7 +5,11 @@ define void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffs ; CHECK-LABEL: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[4:7], s8 idxen offen offset:24 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s18 idxen offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 24 @@ -18,7 +22,11 @@ define void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset_ ; CHECK-LABEL: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 idxen +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 idxen ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -29,7 +37,11 @@ define void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffs ; CHECK-LABEL: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[4:7], s8 idxen offen slc +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; 
CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s18 idxen offen slc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -40,7 +52,11 @@ define void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_vof ; CHECK-LABEL: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[4:7], s8 idxen offen offset:24 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s10, s16 +; CHECK-NEXT: s_mov_b32 s9, s7 +; CHECK-NEXT: s_mov_b32 s8, s6 +; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[8:11], s18 idxen offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 24 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll index e0e4f950cc16c..78204dfefc80c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll @@ -10,7 +10,7 @@ define <2 x bfloat> @struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsr ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN +; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 
%soffset, i32 0) @@ -25,7 +25,7 @@ define void @struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__vgp ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen +; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s6 idxen offen ; GFX1200-NEXT: s_setpc_b64 s[30:31] %unused = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll index 864244b6cebcf..1005996003044 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll @@ -8,7 +8,11 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX908-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[4:7], s8 idxen offen +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s18 idxen offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -17,7 +21,11 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[4:7], s8 idxen offen +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 
+; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[8:11], s18 idxen offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -26,7 +34,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s4 idxen offen +; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s6 idxen offen ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -37,7 +45,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s4 idxen offen +; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s6 idxen offen ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -48,21 +56,29 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voff ; GFX908-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 idxen +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 idxen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: buffer_atomic_add_f32 
v0, v1, s[4:7], s8 idxen +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 idxen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 idxen +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 idxen ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -73,7 +89,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voff ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 idxen +; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 idxen ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void @@ -83,7 +99,11 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX908-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[4:7], s8 idxen offen slc +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s18 idxen offen slc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -92,7 +112,11 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[4:7], s8 idxen offen slc +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[8:11], s18 idxen offen slc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -101,7 +125,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s4 idxen offen nt +; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s6 idxen offen nt ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -112,7 +136,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_NT +; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void @@ -122,7 +146,11 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr ; GFX908-LABEL: struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[4:7], s8 idxen offen +; GFX908-NEXT: s_mov_b32 s11, s17 +; GFX908-NEXT: s_mov_b32 s10, s16 +; GFX908-NEXT: s_mov_b32 s9, s7 +; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: 
buffer_atomic_pk_add_f16 v0, v[1:2], s[8:11], s18 idxen offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -131,7 +159,11 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[4:7], s8 idxen offen +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[8:11], s18 idxen offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -140,7 +172,7 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[0:3], s4 idxen offen +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[0:3], s6 idxen offen ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -151,7 +183,7 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen +; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s6 idxen offen ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll index ba6005e004efc..5f6a67e466020 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll @@ -9,7 +9,11 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[4:7], s8 idxen offen glc +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[8:11], s18 idxen offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -18,7 +22,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s4 idxen offen sc0 +; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s6 idxen offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -29,7 +33,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN +; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -41,14 +45,18 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffs ; GFX90A-LABEL: 
struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 idxen glc +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 idxen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 idxen sc0 +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 idxen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -59,7 +67,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffs ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN +; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 idxen th:TH_ATOMIC_RETURN ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -72,7 +80,11 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[4:7], s8 idxen offen glc slc +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[8:11], s18 idxen offen glc slc ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -81,7 +93,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s4 idxen offen sc0 nt +; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s6 idxen offen sc0 nt ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -92,7 +104,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_NT_RETURN +; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT_RETURN ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -105,7 +117,11 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[4:7], s8 idxen offen glc +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[8:11], s18 idxen offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -114,7 +130,7 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: 
buffer_atomic_pk_add_f16 v0, v[2:3], s[0:3], s4 idxen offen sc0 +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[0:3], s6 idxen offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -125,7 +141,7 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__ ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN +; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll index 1fb5d53d5fd82..bd803c380e90a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll @@ -10,28 +10,40 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen glc +; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s6 idxen offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -42,7 +54,7 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -53,28 +65,40 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX6-LABEL: 
struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen offset:256 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen offset:256 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen offset:256 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 glc +; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 glc ; GFX11-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -85,7 +109,7 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 @@ -97,28 +121,40 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffs ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen glc +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[8:11], s18 idxen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen glc +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[8:11], s18 idxen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen glc +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; 
GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[8:11], s18 idxen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], s4 idxen glc +; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], s6 idxen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -129,7 +165,7 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffs ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s6 idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -140,28 +176,40 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc slc +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen glc slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen 
glc slc +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen glc slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc slc +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen glc slc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen glc slc +; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s6 idxen offen glc slc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -172,7 +220,7 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_NT_RETURN +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -183,27 +231,39 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX6-LABEL: 
struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen +; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s6 idxen offen ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: @@ -213,7 +273,7 @@ define void 
@struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -223,27 +283,39 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen offset:256 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen offset:256 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: 
s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen offset:256 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 +; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: @@ -253,7 +325,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) @@ -265,27 +337,39 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voff ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[8:11], s18 idxen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; 
%bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[8:11], s18 idxen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[8:11], s18 idxen ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], s4 idxen +; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], s6 idxen ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: @@ -295,7 +379,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voff ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s4 idxen +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s6 idxen ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void @@ -305,27 +389,39 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; 
GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen slc +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen slc +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen slc +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen slc ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen slc +; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s6 idxen offen slc ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: @@ -335,7 +431,7 @@ define void 
@struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_NT +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void @@ -353,14 +449,14 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX6-NEXT: v_readfirstlane_b32 s10, v3 ; GFX6-NEXT: v_readfirstlane_b32 s11, v4 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[3:4] -; GFX6-NEXT: s_and_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[3:4] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc +; GFX6-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc ; GFX6-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX6-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX6-NEXT: s_xor_b64 exec, exec, s[6:7] +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[12:13] @@ -377,14 +473,14 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX7-NEXT: v_readfirstlane_b32 s10, v3 ; GFX7-NEXT: v_readfirstlane_b32 s11, v4 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[3:4] -; GFX7-NEXT: s_and_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], 
v[3:4] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc ; GFX7-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX7-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX7-NEXT: s_xor_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[12:13] @@ -394,25 +490,25 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, exec_lo +; GFX10-NEXT: s_mov_b32 s5, exec_lo ; GFX10-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v1 ; GFX10-NEXT: v_readfirstlane_b32 s9, v2 ; GFX10-NEXT: v_readfirstlane_b32 s10, v3 ; GFX10-NEXT: v_readfirstlane_b32 s11, v4 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[1:2] -; GFX10-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[3:4] -; GFX10-NEXT: s_and_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_and_saveexec_b32 s5, s5 +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[3:4] +; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc +; GFX10-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc ; GFX10-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX10-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 
exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll index b859147b6dc6b..4f9bac584a78e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll @@ -9,14 +9,22 @@ define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -27,14 +35,22 @@ define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: 
struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 glc +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 glc +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 @@ -46,14 +62,22 @@ define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__0_vof ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], s8 idxen glc +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], s18 idxen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], s8 idxen glc +; 
GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], s18 idxen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -64,14 +88,22 @@ define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc slc +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc slc +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -82,14 +114,22 @@ define void @struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -100,14 +140,22 @@ define void @struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 +; GFX7-NEXT: 
s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 @@ -120,14 +168,22 @@ define void @struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__0_vof ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], s8 idxen +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], s18 idxen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], s8 idxen +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], s18 idxen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -138,14 +194,22 @@ define void @struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen slc +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: 
s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen slc +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -164,14 +228,14 @@ define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__vgpr_rsrc__vgpr_ ; GFX6-NEXT: v_readfirstlane_b32 s10, v4 ; GFX6-NEXT: v_readfirstlane_b32 s11, v5 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[4:5] -; GFX6-NEXT: s_and_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s4 idxen offen offset:256 glc +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s6 idxen offen offset:256 glc ; GFX6-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX6-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX6-NEXT: s_xor_b64 exec, exec, s[6:7] +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[12:13] @@ -188,14 +252,14 @@ define double 
@struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__vgpr_rsrc__vgpr_ ; GFX7-NEXT: v_readfirstlane_b32 s10, v4 ; GFX7-NEXT: v_readfirstlane_b32 s11, v5 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[4:5] -; GFX7-NEXT: s_and_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s4 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s6 idxen offen offset:256 glc ; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX7-NEXT: s_xor_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[12:13] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll index 87055db9a58f0..c9b50eddc94ee 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll @@ -10,28 +10,40 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: 
struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen glc +; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s6 idxen offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -42,7 +54,7 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 
%voffset, i32 %soffset, i32 0) @@ -53,28 +65,40 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen offset:256 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen offset:256 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen offset:256 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, 
v[1:2], s[0:3], s4 idxen offen offset:256 glc +; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -85,7 +109,7 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 @@ -97,28 +121,40 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voff ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen glc +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[8:11], s18 idxen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen glc +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[8:11], s18 idxen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen glc +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[8:11], s18 idxen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], s4 idxen glc +; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], s6 idxen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -129,7 +165,7 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voff ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s6 idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -140,28 +176,40 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc slc +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen glc slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: 
struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc slc +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen glc slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc slc +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen glc slc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen glc slc +; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s6 idxen offen glc slc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -172,7 +220,7 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_NT_RETURN +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, 
ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -183,27 +231,39 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen +; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s6 
idxen offen ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: @@ -213,7 +273,7 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -223,27 +283,39 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen offset:256 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen offset:256 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX10: 
; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256 +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen offset:256 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 +; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: @@ -253,7 +325,7 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) @@ -265,27 +337,39 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_vof ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 
s18 idxen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[8:11], s18 idxen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[8:11], s18 idxen ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], s4 idxen +; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], s6 idxen ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: @@ -295,7 +379,7 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_vof ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s4 idxen +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s6 idxen ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void @@ 
-305,27 +389,39 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen slc +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen slc +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen slc +; GFX10-NEXT: s_mov_b32 s11, s17 +; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen slc ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen slc +; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s6 idxen offen slc ; GFX11-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: @@ -335,7 +431,7 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_NT +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void @@ -353,14 +449,14 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX6-NEXT: v_readfirstlane_b32 s10, v3 ; GFX6-NEXT: v_readfirstlane_b32 s11, v4 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[3:4] -; GFX6-NEXT: s_and_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[3:4] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc +; GFX6-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc ; GFX6-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX6-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX6-NEXT: s_xor_b64 exec, exec, s[6:7] +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[12:13] @@ -377,14 +473,14 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX7-NEXT: v_readfirstlane_b32 s10, v3 ; GFX7-NEXT: v_readfirstlane_b32 s11, v4 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2] -; GFX7-NEXT: v_cmp_eq_u64_e64 
s[6:7], s[10:11], v[3:4] -; GFX7-NEXT: s_and_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[3:4] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc ; GFX7-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX7-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX7-NEXT: s_xor_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[12:13] @@ -394,25 +490,25 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, exec_lo +; GFX10-NEXT: s_mov_b32 s5, exec_lo ; GFX10-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v1 ; GFX10-NEXT: v_readfirstlane_b32 s9, v2 ; GFX10-NEXT: v_readfirstlane_b32 s10, v3 ; GFX10-NEXT: v_readfirstlane_b32 s11, v4 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[1:2] -; GFX10-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[3:4] -; GFX10-NEXT: s_and_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_and_saveexec_b32 s5, s5 +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[3:4] +; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc +; GFX10-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc ; GFX10-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX10-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; 
GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll index 5c23a86dab33a..01bc833d59be7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll @@ -9,14 +9,22 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -27,14 +35,22 @@ 
define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 glc +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 glc +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 @@ -46,14 +62,22 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__0_vof ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], s8 idxen glc +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], s18 idxen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], s8 idxen glc +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], s18 idxen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -64,14 +88,22 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc slc +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc slc +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -82,14 +114,22 @@ define void @struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: 
struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -100,14 +140,22 @@ define void @struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 @@ -120,14 +168,22 @@ define void @struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__0_vof ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], s8 idxen +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], s18 idxen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], s8 idxen +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], s18 idxen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -138,14 +194,22 @@ define void @struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: 
buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen slc +; GFX6-NEXT: s_mov_b32 s11, s17 +; GFX6-NEXT: s_mov_b32 s10, s16 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen slc +; GFX7-NEXT: s_mov_b32 s11, s17 +; GFX7-NEXT: s_mov_b32 s10, s16 +; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -164,14 +228,14 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__vgpr_rsrc__vgpr_ ; GFX6-NEXT: v_readfirstlane_b32 s10, v4 ; GFX6-NEXT: v_readfirstlane_b32 s11, v5 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[4:5] -; GFX6-NEXT: s_and_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s4 idxen offen offset:256 glc +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s6 idxen offen offset:256 glc ; GFX6-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX6-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX6-NEXT: s_xor_b64 exec, exec, s[6:7] +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: 
s_cbranch_execnz .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[12:13] @@ -188,14 +252,14 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__vgpr_rsrc__vgpr_ ; GFX7-NEXT: v_readfirstlane_b32 s10, v4 ; GFX7-NEXT: v_readfirstlane_b32 s11, v5 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[4:5] -; GFX7-NEXT: s_and_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s4 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s6 idxen offen offset:256 glc ; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX7-NEXT: s_xor_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[12:13] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll index 439742d6b315d..38fdcf47171af 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll @@ -8,40 +8,40 @@ define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 
v0, s6 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s7 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s5 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_x v0, v1, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, v1, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_x: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_x: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 +; GFX11-PACKED-NEXT: 
s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -57,43 +57,43 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x half> %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s4, s6, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s6, 0xffff -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s5 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s4 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s7 +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s4, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s6 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xy v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, v1, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; 
PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, v1, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -109,29 +109,29 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[4:5], 0x18 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dword s6, s[6:7], 0x18 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s7, 0xffff -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s6, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s6, s6, 0xffff -; 
PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s4, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s7 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s4 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s6 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyz v[0:2], v3, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dword s8, s[4:5], 0x18 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: s_and_b32 s4, s7, 0xffff -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s4 +; PREGFX10-PACKED-NEXT: s_and_b32 s5, s5, 0xffff +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED-NEXT: s_endpgm @@ -139,13 +139,13 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x ha ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x2 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX10-PACKED-NEXT: s_load_dword s8, s[4:5], 0x18 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; 
GFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: s_and_b32 s4, s7, 0xffff -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-PACKED-NEXT: s_and_b32 s5, s5, 0xffff +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm @@ -153,9 +153,9 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x ha ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x2 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 -; GFX11-PACKED-NEXT: s_load_b32 s6, s[0:1], 0x18 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 +; GFX11-PACKED-NEXT: s_load_b32 s6, s[2:3], 0x18 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: s_and_b32 s5, s5, 0xffff ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 @@ -174,30 +174,30 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[4:5], 0x18 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dword s6, s[6:7], 0x18 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s5, s7, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s7, 
s7, 0xffff -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s8, s6, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s6, s6, 0xffff -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s5, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s8, s4, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s8 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s7 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s5 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v4, s4 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s7 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v4, s6 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dword s8, s[4:5], 0x18 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED-NEXT: s_endpgm @@ -205,12 +205,12 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x h ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x2 -; 
GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX10-PACKED-NEXT: s_load_dword s8, s[4:5], 0x18 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm @@ -218,9 +218,9 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x h ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x2 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 -; GFX11-PACKED-NEXT: s_load_b32 s6, s[0:1], 0x18 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 +; GFX11-PACKED-NEXT: s_load_b32 s6, s[2:3], 0x18 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll index 22ec22dc2db02..1da076c652399 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll @@ -11,40 +11,40 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; 
PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s6 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s7 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s5 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_x v0, v1, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, v1, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_x: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: 
tbuffer_store_d16_x: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -56,8 +56,8 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 ; GFX12-PACKED-LABEL: tbuffer_store_d16_x: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: s_clause 0x1 -; GFX12-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 -; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -73,43 +73,43 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s4, s6, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s6, 0xffff -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s5 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s4 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s7 +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s4, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s6 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5 ; 
PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xy v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, v1, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, v1, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: 
v_mov_b32_e32 v1, s5 @@ -121,8 +121,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat ; GFX12-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: s_clause 0x1 -; GFX12-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 -; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -138,29 +138,29 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[4:5], 0x18 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dword s6, s[6:7], 0x18 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s7, 0xffff -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s6, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s6, s6, 0xffff -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s4, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s7 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s4 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s6 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyz v[0:2], v3, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: 
tbuffer_store_d16_xyz: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dword s8, s[4:5], 0x18 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: s_and_b32 s4, s7, 0xffff -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s4 +; PREGFX10-PACKED-NEXT: s_and_b32 s5, s5, 0xffff +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED-NEXT: s_endpgm @@ -168,13 +168,13 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x2 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX10-PACKED-NEXT: s_load_dword s8, s[4:5], 0x18 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: s_and_b32 s4, s7, 0xffff -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-PACKED-NEXT: s_and_b32 s5, s5, 0xffff +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm @@ -182,9 +182,9 
@@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x2 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 -; GFX11-PACKED-NEXT: s_load_b32 s6, s[0:1], 0x18 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 +; GFX11-PACKED-NEXT: s_load_b32 s6, s[2:3], 0x18 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: s_and_b32 s5, s5, 0xffff ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 @@ -198,8 +198,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; GFX12-PACKED-SDAG-LABEL: tbuffer_store_d16_xyz: ; GFX12-PACKED-SDAG: ; %bb.0: ; %main_body ; GFX12-PACKED-SDAG-NEXT: s_clause 0x1 -; GFX12-PACKED-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x10 -; GFX12-PACKED-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-PACKED-SDAG-NEXT: s_load_b96 s[4:6], s[2:3], 0x10 +; GFX12-PACKED-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX12-PACKED-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-SDAG-NEXT: s_and_b32 s5, s5, 0xffff ; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s4 @@ -213,8 +213,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; GFX12-PACKED-GISEL-LABEL: tbuffer_store_d16_xyz: ; GFX12-PACKED-GISEL: ; %bb.0: ; %main_body ; GFX12-PACKED-GISEL-NEXT: s_clause 0x1 -; GFX12-PACKED-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x10 -; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-PACKED-GISEL-NEXT: s_load_b96 s[4:6], s[2:3], 0x10 +; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX12-PACKED-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-GISEL-NEXT: s_pack_lh_b32_b16 s4, s4, s4 ; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s6 @@ -233,30 +233,30 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, 
i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[4:5], 0x18 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dword s6, s[6:7], 0x18 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s5, s7, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s7, s7, 0xffff -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s8, s6, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s6, s6, 0xffff -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s5, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s8, s4, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s8 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s7 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s5 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v4, s4 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s7 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v4, s6 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dword s8, s[4:5], 0x18 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; 
PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED-NEXT: s_endpgm @@ -264,12 +264,12 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x2 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX10-PACKED-NEXT: s_load_dword s8, s[4:5], 0x18 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm @@ -277,9 +277,9 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x2 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 -; GFX11-PACKED-NEXT: s_load_b32 s6, s[0:1], 0x18 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 +; GFX11-PACKED-NEXT: s_load_b32 s6, s[2:3], 0x18 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -292,8 
+292,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d ; GFX12-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: s_clause 0x1 -; GFX12-PACKED-NEXT: s_load_b96 s[4:6], s[0:1], 0x10 -; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-PACKED-NEXT: s_load_b96 s[4:6], s[2:3], 0x10 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll index 0755dcddd8f46..279a64adfbda1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 { ; SI-LABEL: bfe_u32_arg_arg_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -18,7 +18,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, ; ; VI-LABEL: bfe_u32_arg_arg_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -36,7 +36,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; SI-LABEL: bfe_u32_arg_arg_imm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7b @@ -50,7 +50,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr 
addrspace(1) %out, i32 %src0, ; ; VI-LABEL: bfe_u32_arg_arg_imm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0x7b ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -69,7 +69,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, i32 %src2) #0 { ; SI-LABEL: bfe_u32_arg_imm_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7b @@ -83,7 +83,7 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, ; ; VI-LABEL: bfe_u32_arg_imm_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7b ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -102,7 +102,7 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, i32 %src2) #0 { ; SI-LABEL: bfe_u32_imm_arg_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_movk_i32 s8, 0x7b @@ -117,7 +117,7 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, ; ; VI-LABEL: bfe_u32_imm_arg_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_movk_i32 s8, 0x7b ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -137,7 +137,7 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, i32 
%src0, i32 %src1) #0 { ; SI-LABEL: bfe_u32_arg_0_width_reg_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -147,7 +147,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, ; ; VI-LABEL: bfe_u32_arg_0_width_reg_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -162,7 +162,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; SI-LABEL: bfe_u32_arg_0_width_imm_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -172,7 +172,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out, ; ; VI-LABEL: bfe_u32_arg_0_width_imm_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -187,7 +187,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zextload_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -204,7 +204,7 @@ define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: bfe_u32_zextload_i8: ; VI: ; %bb.0: 
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -229,7 +229,7 @@ define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -248,7 +248,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: bfe_u32_zext_in_reg_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -275,7 +275,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -294,7 +294,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: bfe_u32_zext_in_reg_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -321,7 +321,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_1: ; SI: ; 
%bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -341,7 +341,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out ; ; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -369,7 +369,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -389,7 +389,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out ; ; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -417,7 +417,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_7: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -437,7 +437,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out ; ; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_7: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -465,7 +465,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i16_offset_8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -484,7 +484,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %ou ; ; VI-LABEL: bfe_u32_zext_in_reg_i16_offset_8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -511,7 +511,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %ou define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -529,7 +529,7 @@ define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -553,7 +553,7 @@ define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -563,7 +563,7 @@ define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -580,7 +580,7 @@ define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -590,7 +590,7 @@ define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -607,7 +607,7 @@ define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -617,7 +617,7 @@ define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -635,7 +635,7 @@ define 
amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_5: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -653,7 +653,7 @@ define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_5: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -679,7 +679,7 @@ define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_6: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -698,7 +698,7 @@ define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_6: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -724,7 +724,7 @@ define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_7: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -742,7 +742,7 @@ define amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1 ; ; 
VI-LABEL: bfe_u32_test_7: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -767,7 +767,7 @@ define amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -785,7 +785,7 @@ define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -810,7 +810,7 @@ define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_9: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -828,7 +828,7 @@ define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_9: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -852,7 +852,7 @@ define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -870,7 +870,7 @@ define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -894,7 +894,7 @@ define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_11: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -912,7 +912,7 @@ define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_11: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -936,7 +936,7 @@ define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_12: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -954,7 +954,7 @@ define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_12: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -979,7 
+979,7 @@ define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_13: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -997,7 +997,7 @@ define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_13: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1021,7 +1021,7 @@ define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_14: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -1031,7 +1031,7 @@ define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_14: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -1047,7 +1047,7 @@ define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1057,7 +1057,7 @@ define amdgpu_kernel void 
@bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1072,7 +1072,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1082,7 +1082,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1097,7 +1097,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1107,7 +1107,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1122,7 +1122,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) # define amdgpu_kernel void 
@bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1132,7 +1132,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1147,7 +1147,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -1 @@ -1157,7 +1157,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -1 @@ -1172,7 +1172,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_5: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1182,7 +1182,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) # ; ; VI-LABEL: 
bfe_u32_constant_fold_test_5: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1197,7 +1197,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_6: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x80 @@ -1207,7 +1207,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_6: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x80 @@ -1222,7 +1222,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_7: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1232,7 +1232,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_7: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1247,7 +1247,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) #0 { ; SI-LABEL: 
bfe_u32_constant_fold_test_8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1257,7 +1257,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1272,7 +1272,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_9: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1282,7 +1282,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_9: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1297,7 +1297,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1307,7 +1307,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 
+; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1322,7 +1322,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_11: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 10 @@ -1332,7 +1332,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_11: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 10 @@ -1347,7 +1347,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_12: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1357,7 +1357,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_12: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1372,7 +1372,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_13: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; 
SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1382,7 +1382,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_13: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1397,7 +1397,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_14: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 40 @@ -1407,7 +1407,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_14: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 40 @@ -1422,7 +1422,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_15: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 10 @@ -1432,7 +1432,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_15: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: 
s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 10 @@ -1447,7 +1447,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1457,7 +1457,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1472,7 +1472,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_17: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1482,7 +1482,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_17: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1497,7 +1497,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_18: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; 
SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1507,7 +1507,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_18: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1526,47 +1526,45 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(ptr addrspace(1) %out0, ; SI-LABEL: simplify_bfe_u32_multi_use_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; SI-NEXT: s_mov_b32 s0, s8 -; SI-NEXT: s_mov_b32 s1, s9 -; SI-NEXT: s_mov_b32 s4, s10 -; SI-NEXT: s_mov_b32 s5, s11 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s8 +; SI-NEXT: s_mov_b32 s5, s9 +; SI-NEXT: s_mov_b32 s0, s10 +; SI-NEXT: s_mov_b32 s1, s11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 63, v0 ; SI-NEXT: v_bfe_u32 v1, v0, 2, 2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: simplify_bfe_u32_multi_use_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 +; 
VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 63, v0 ; VI-NEXT: v_bfe_u32 v1, v0, 2, 2 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v1, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ptr addrspace(1) %out1, ptr addrspace(1) %in) #0 { @@ -1581,11 +1579,11 @@ define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(ptr addrspace(1) %out0 define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 { ; SI-LABEL: lshr_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s4, s2, 0x30006 +; SI-NEXT: s_bfe_u32 s4, s4, 0x30006 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1593,8 +1591,8 @@ define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 { ; ; VI-LABEL: lshr_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; 
VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1611,7 +1609,7 @@ define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; SI-LABEL: v_lshr_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s2, s2, s3 @@ -1625,7 +1623,7 @@ define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 ; ; VI-LABEL: v_lshr_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1645,11 +1643,11 @@ define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; SI-LABEL: and_lshr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s4, s2, 0x30006 +; SI-NEXT: s_bfe_u32 s4, s4, 0x30006 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1657,8 +1655,8 @@ define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; ; VI-LABEL: and_lshr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1675,11 +1673,11 @@ define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void 
@and_lshr2(ptr addrspace(1) %out, i32 %a) #0 { ; SI-LABEL: and_lshr2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s4, s2, 0x30006 +; SI-NEXT: s_bfe_u32 s4, s4, 0x30006 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1687,8 +1685,8 @@ define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 { ; ; VI-LABEL: and_lshr2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1705,11 +1703,11 @@ define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @shl_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; SI-LABEL: shl_lshr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s4, s2, 0x150002 +; SI-NEXT: s_bfe_u32 s4, s4, 0x150002 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1717,8 +1715,8 @@ define amdgpu_kernel void @shl_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; ; VI-LABEL: shl_lshr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll index ab29ca4a99734..abce1f6cd8f84 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll @@ -1,7 +1,8 @@ -; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s -; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,MESA3D %s -; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,MESA3D %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s -o %t.bc +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %t.bc | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %t.bc | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s +; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tahiti < %t.bc | FileCheck -check-prefixes=ALL,MESA3D %s +; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga < %t.bc | FileCheck -check-prefixes=ALL,MESA3D %s declare i32 @llvm.amdgcn.workgroup.id.x() #0 declare i32 @llvm.amdgcn.workgroup.id.y() #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll index 47f988fc17d28..eaee8ec73fe41 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll @@ -1,9 +1,10 @@ -; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s -; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck 
-check-prefixes=ALL,MESA3D,UNPACKED %s -; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s -; RUN: llc -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,PACKED-TID %s -; RUN: llc -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=ALL,PACKED-TID %s +; RUN: opt -mtriple=amdgcn-- -passes=amdgpu-attributor -o %t.bc %s +; RUN: llc -mtriple=amdgcn -mcpu=hawaii < %t.bc | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %t.bc | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s +; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=hawaii < %t.bc | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s +; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -mattr=-flat-for-global < %t.bc | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s +; RUN: llc -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a < %t.bc | FileCheck -check-prefixes=ALL,PACKED-TID %s +; RUN: llc -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %t.bc | FileCheck -check-prefixes=ALL,PACKED-TID %s declare i32 @llvm.amdgcn.workitem.id.x() #0 declare i32 @llvm.amdgcn.workitem.id.y() #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll index 31f1085dd76ee..9d93ca65683c4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -14,7 +14,7 @@ declare double @llvm.amdgcn.writelane.f64(double, i32, double) #0 define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; 
GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -28,7 +28,7 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; ; GFX1010-SDAG-LABEL: test_writelane_sreg_i32: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dword s4, s[0:1], 0x0 @@ -40,7 +40,7 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; ; GFX1100-SDAG-LABEL: test_writelane_sreg_i32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x0 @@ -54,7 +54,7 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; ; GFX802-GISEL-LABEL: test_writelane_sreg_i32: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -68,7 +68,7 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; ; GFX1010-GISEL-LABEL: test_writelane_sreg_i32: ; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dword s4, s[0:1], 0x0 @@ -80,7 +80,7 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; ; GFX1100-GISEL-LABEL: test_writelane_sreg_i32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x0 @@ -100,8 +100,8 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_i64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s6, s[6:7], 0x10 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 @@ -118,24 +118,24 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s ; GFX1010-SDAG-LABEL: test_writelane_sreg_i64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s8, s[6:7], 0x10 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s5 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s6 -; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s8 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s8 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_sreg_i64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; 
GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x10 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x10 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 @@ -151,8 +151,8 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s ; ; GFX802-GISEL-LABEL: test_writelane_sreg_i64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s6, s[6:7], 0x10 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 @@ -169,24 +169,24 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s ; GFX1010-GISEL-LABEL: test_writelane_sreg_i64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s8, s[6:7], 0x10 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s6 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s8 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s8 ; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_sreg_i64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; 
GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x10 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x10 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 @@ -208,8 +208,8 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_f64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s6, s[6:7], 0x10 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 @@ -226,24 +226,24 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double ; GFX1010-SDAG-LABEL: test_writelane_sreg_f64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s8, s[6:7], 0x10 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s5 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s6 -; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s8 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s8 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_sreg_f64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 
0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x10 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x10 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 @@ -259,8 +259,8 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double ; ; GFX802-GISEL-LABEL: test_writelane_sreg_f64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s6, s[6:7], 0x10 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 @@ -277,24 +277,24 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double ; GFX1010-GISEL-LABEL: test_writelane_sreg_f64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s8, s[6:7], 0x10 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s6 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s8 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s8 ; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_sreg_f64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; 
GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x10 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x10 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 @@ -316,8 +316,8 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_sreg_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -331,8 +331,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 ; GFX1010-SDAG-LABEL: test_writelane_imm_sreg_i32: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -345,23 +345,23 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 ; GFX1100-SDAG-LABEL: test_writelane_imm_sreg_i32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s1 -; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 32, s0 -; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s3 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 32, s2 +; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX802-GISEL-LABEL: test_writelane_imm_sreg_i32: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -375,8 +375,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 ; GFX1010-GISEL-LABEL: test_writelane_imm_sreg_i32: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -389,15 +389,15 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 ; GFX1100-GISEL-LABEL: test_writelane_imm_sreg_i32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x8 ; 
GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s1 -; GFX1100-GISEL-NEXT: v_writelane_b32 v0, 32, s0 -; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, 32, s2 +; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm @@ -410,8 +410,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_sreg_i64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 @@ -427,41 +427,41 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3 ; GFX1010-SDAG-LABEL: test_writelane_imm_sreg_i64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s4, s[6:7], 0x8 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, 
0, s6 -; GFX1010-SDAG-NEXT: v_writelane_b32 v0, 32, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, 0, s4 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, 32, s4 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_imm_sreg_i64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x8 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x8 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX1100-SDAG-NEXT: v_writelane_b32 v1, 0, s4 ; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 32, s4 -; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX802-GISEL-LABEL: test_writelane_imm_sreg_i64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -477,33 +477,33 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3 ; GFX1010-GISEL-LABEL: test_writelane_imm_sreg_i64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 
-; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s4, s[6:7], 0x8 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, 32, s6 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, 0, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, 32, s4 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, 0, s4 ; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_imm_sreg_i64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x8 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x8 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GFX1100-GISEL-NEXT: v_writelane_b32 v0, 32, s4 ; GFX1100-GISEL-NEXT: v_writelane_b32 v1, 0, s4 -; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm @@ -516,8 +516,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3 define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i32 %src1) #1 { ; 
GFX802-SDAG-LABEL: test_writelane_imm_sreg_f64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x8 ; GFX802-SDAG-NEXT: s_mov_b32 s5, 0x40400000 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -535,8 +535,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX1010-SDAG-LABEL: test_writelane_imm_sreg_f64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s4, s[6:7], 0x8 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -544,35 +544,35 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX1010-SDAG-NEXT: s_mov_b32 s2, 0x40400000 -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s2, s6 -; GFX1010-SDAG-NEXT: v_writelane_b32 v0, 0, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, 0, s4 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_imm_sreg_f64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x8 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x8 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b64 
s[2:3], s[0:1], 0x0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX1100-SDAG-NEXT: s_mov_b32 s0, 0x40400000 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX1100-SDAG-NEXT: s_mov_b32 s2, 0x40400000 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s2, s4 ; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 0, s4 -; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX802-GISEL-LABEL: test_writelane_imm_sreg_f64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x8 ; GFX802-GISEL-NEXT: s_mov_b32 s5, 0x40400000 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -590,8 +590,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX1010-GISEL-LABEL: test_writelane_imm_sreg_f64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s4, s[6:7], 0x8 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -599,26 +599,26 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GFX1010-GISEL-NEXT: s_mov_b32 s2, 
0x40400000 -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, 0, s6 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s2, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, 0, s4 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s2, s4 ; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_imm_sreg_f64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x8 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x8 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1100-GISEL-NEXT: s_mov_b32 s0, 0x40400000 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX1100-GISEL-NEXT: s_mov_b32 s2, 0x40400000 ; GFX1100-GISEL-NEXT: v_writelane_b32 v0, 0, s4 -; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm @@ -631,7 +631,7 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { ; GFX802-SDAG-LABEL: test_writelane_vreg_lane_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX802-SDAG-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -654,7 +654,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; ; GFX1010-SDAG-LABEL: test_writelane_vreg_lane_i32: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1010-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: global_load_dword v0, v0, s[2:3] offset:4 @@ -671,7 +671,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; ; GFX1100-SDAG-LABEL: test_writelane_vreg_lane_i32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 @@ -690,7 +692,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; ; GFX802-GISEL-LABEL: test_writelane_vreg_lane_i32: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -714,7 +716,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; ; GFX1010-GISEL-LABEL: test_writelane_vreg_lane_i32: ; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1010-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: global_load_dword v0, v0, s[2:3] offset:4 @@ -731,7 +733,9 @@ define amdgpu_kernel void 
@test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; ; GFX1100-GISEL-LABEL: test_writelane_vreg_lane_i32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 @@ -760,7 +764,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { ; GFX802-SDAG-LABEL: test_writelane_vreg_lane_i64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -785,7 +789,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; ; GFX1010-SDAG-LABEL: test_writelane_vreg_lane_i64: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1010-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -804,9 +808,11 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; ; GFX1100-SDAG-LABEL: test_writelane_vreg_lane_i64: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; 
GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] offset:8 ; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -825,7 +831,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; ; GFX802-GISEL-LABEL: test_writelane_vreg_lane_i64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -850,7 +856,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; ; GFX1010-GISEL-LABEL: test_writelane_vreg_lane_i64: ; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1010-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:8 @@ -868,7 +874,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; ; GFX1100-GISEL-LABEL: test_writelane_vreg_lane_i64: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:8 @@ -899,7 +907,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { ; GFX802-SDAG-LABEL: test_writelane_vreg_lane_f64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; 
GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX802-SDAG-NEXT: s_mov_b32 s4, 0x40280000 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -926,7 +934,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; ; GFX1010-SDAG-LABEL: test_writelane_vreg_lane_f64: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1010-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -946,9 +954,11 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; ; GFX1100-SDAG-LABEL: test_writelane_vreg_lane_f64: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] offset:8 ; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -968,7 +978,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; ; GFX802-GISEL-LABEL: test_writelane_vreg_lane_f64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; GFX802-GISEL-NEXT: s_mov_b32 s4, 0x40280000 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -995,7 +1005,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; ; GFX1010-GISEL-LABEL: test_writelane_vreg_lane_f64: ; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1010-GISEL-NEXT: 
v_lshlrev_b32_e32 v0, 4, v0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:8 @@ -1014,7 +1024,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; ; GFX1100-GISEL-LABEL: test_writelane_vreg_lane_f64: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:8 @@ -1047,8 +1059,8 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_m0_sreg_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 ; GFX802-SDAG-NEXT: ;;#ASMSTART ; GFX802-SDAG-NEXT: s_mov_b32 m0, -1 ; GFX802-SDAG-NEXT: ;;#ASMEND @@ -1067,8 +1079,8 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; GFX1010-SDAG-LABEL: test_writelane_m0_sreg_i32: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: ;;#ASMSTART ; GFX1010-SDAG-NEXT: s_mov_b32 m0, -1 @@ -1084,26 +1096,26 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; GFX1100-SDAG-LABEL: test_writelane_m0_sreg_i32: ; GFX1100-SDAG: ; %bb.0: 
; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: ;;#ASMSTART ; GFX1100-SDAG-NEXT: s_mov_b32 m0, -1 ; GFX1100-SDAG-NEXT: ;;#ASMEND ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s1 -; GFX1100-SDAG-NEXT: v_writelane_b32 v0, m0, s0 -; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s3 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, m0, s2 +; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX802-GISEL-LABEL: test_writelane_m0_sreg_i32: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 ; GFX802-GISEL-NEXT: ;;#ASMSTART ; GFX802-GISEL-NEXT: s_mov_b32 m0, -1 ; GFX802-GISEL-NEXT: ;;#ASMEND @@ -1122,8 +1134,8 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; GFX1010-GISEL-LABEL: test_writelane_m0_sreg_i32: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 ; GFX1010-GISEL-NEXT: ;;#ASMSTART ; GFX1010-GISEL-NEXT: s_mov_b32 m0, -1 ; GFX1010-GISEL-NEXT: ;;#ASMEND @@ -1139,18 +1151,18 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr 
addrspace(1) %out, i32 ; GFX1100-GISEL-LABEL: test_writelane_m0_sreg_i32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX1100-GISEL-NEXT: ;;#ASMSTART ; GFX1100-GISEL-NEXT: s_mov_b32 m0, -1 ; GFX1100-GISEL-NEXT: ;;#ASMEND ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s1 -; GFX1100-GISEL-NEXT: v_writelane_b32 v0, m0, s0 -; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, m0, s2 +; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm @@ -1164,8 +1176,8 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %src0) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -1179,8 +1191,8 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr ; GFX1010-SDAG-LABEL: test_writelane_imm_i32: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword 
s2, s[4:5], 0x8 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -1193,23 +1205,23 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr ; GFX1100-SDAG-LABEL: test_writelane_imm_i32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s1 -; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s0, 32 -; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s3 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, 32 +; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX802-GISEL-LABEL: test_writelane_imm_i32: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -1223,8 +1235,8 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr ; GFX1010-GISEL-LABEL: test_writelane_imm_i32: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; 
GFX1010-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -1237,15 +1249,15 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr ; GFX1100-GISEL-LABEL: test_writelane_imm_i32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s1 -; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s0, 32 -; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, 32 +; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm @@ -1258,7 +1270,7 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %src0) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_i64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 @@ -1273,7 +1285,7 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr ; ; GFX1010-SDAG-LABEL: 
test_writelane_imm_i64: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -1287,7 +1299,7 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr ; ; GFX1100-SDAG-LABEL: test_writelane_imm_i64: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 @@ -1303,7 +1315,7 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr ; ; GFX802-GISEL-LABEL: test_writelane_imm_i64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1318,7 +1330,7 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr ; ; GFX1010-GISEL-LABEL: test_writelane_imm_i64: ; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -1332,7 +1344,7 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr ; ; GFX1100-GISEL-LABEL: test_writelane_imm_i64: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: s_load_b64 
s[4:5], s[0:1], 0x0 @@ -1354,7 +1366,7 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double %src0) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_f64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 @@ -1369,7 +1381,7 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; ; GFX1010-SDAG-LABEL: test_writelane_imm_f64: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -1383,7 +1395,7 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; ; GFX1100-SDAG-LABEL: test_writelane_imm_f64: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 @@ -1399,7 +1411,7 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; ; GFX802-GISEL-LABEL: test_writelane_imm_f64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1414,7 +1426,7 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; ; GFX1010-GISEL-LABEL: test_writelane_imm_f64: ; GFX1010-GISEL: ; %bb.0: -; 
GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -1428,7 +1440,7 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; ; GFX1100-GISEL-LABEL: test_writelane_imm_f64: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 @@ -1450,10 +1462,10 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s6 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s4 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-SDAG-NEXT: v_writelane_b32 v2, s2, m0 @@ -1464,11 +1476,11 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_i32: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; GFX1010-SDAG-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-SDAG-NEXT: 
v_mov_b32_e32 v0, s6 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s4 ; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1010-SDAG-NEXT: global_store_dword v1, v0, s[0:1] ; GFX1010-SDAG-NEXT: s_endpgm @@ -1476,8 +1488,8 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ; GFX1100-SDAG-LABEL: test_writelane_sreg_oldval_i32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x8 +; GFX1100-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x8 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4 @@ -1489,10 +1501,10 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ; ; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_i32: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_writelane_b32 v2, s2, m0 @@ -1503,11 +1515,11 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_i32: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; GFX1010-GISEL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1010-GISEL-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1010-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX1010-GISEL-NEXT: s_endpgm @@ -1515,8 +1527,8 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ; GFX1100-GISEL-LABEL: test_writelane_sreg_oldval_i32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x8 +; GFX1100-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x8 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4 @@ -1533,12 +1545,12 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_i64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x18 -; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s8, s[6:7], 0x18 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s8 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1550,30 +1562,30 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr ; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_i64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x2 -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX1010-SDAG-NEXT: s_load_dword 
s8, s[4:5], 0x18 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX1010-SDAG-NEXT: s_load_dword s8, s[6:7], 0x18 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s7, s8 -; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s6, s8 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s5, s8 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s4, s8 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_sreg_oldval_i64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x2 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x10 -; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x18 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x18 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s5 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s3, s0 -; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s0 +; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s1, s2 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s0, s2 ; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[6:7] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1581,13 +1593,13 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr ; ; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_i64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x18 -; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; 
GFX802-GISEL-NEXT: s_load_dword s8, s[6:7], 0x18 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s8 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0 @@ -1598,30 +1610,30 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr ; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_i64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x2 -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX1010-GISEL-NEXT: s_load_dword s8, s[4:5], 0x18 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX1010-GISEL-NEXT: s_load_dword s8, s[6:7], 0x18 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s6, s8 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s7, s8 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s4, s8 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s5, s8 ; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_sreg_oldval_i64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x2 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x10 -; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x18 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x18 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) 
; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s0 -; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s3, s0 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s0, s2 +; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s1, s2 ; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[6:7] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1634,12 +1646,12 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ptr addrspace(1) %out, double %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_f64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x18 -; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s8, s[6:7], 0x18 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s8 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1651,30 +1663,30 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_f64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x2 -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX1010-SDAG-NEXT: s_load_dword s8, s[4:5], 0x18 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX1010-SDAG-NEXT: s_load_dword s8, s[6:7], 0x18 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s7, s8 -; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s6, s8 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s5, s8 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s4, s8 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_sreg_oldval_f64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x2 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x10 -; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x18 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x18 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s5 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s3, s0 -; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s0 +; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s1, s2 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s0, s2 ; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[6:7] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1682,13 +1694,13 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ; ; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_f64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x18 -; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s8, s[6:7], 0x18 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 +; 
GFX802-GISEL-NEXT: s_mov_b32 m0, s8 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0 @@ -1699,30 +1711,30 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_f64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x2 -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX1010-GISEL-NEXT: s_load_dword s8, s[4:5], 0x18 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX1010-GISEL-NEXT: s_load_dword s8, s[6:7], 0x18 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s6, s8 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s7, s8 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s4, s8 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s5, s8 ; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_sreg_oldval_f64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x2 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x10 -; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x18 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x18 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s0 -; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s3, s0 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s0, s2 +; GFX1100-GISEL-NEXT: v_writelane_b32 
v1, s1, s2 ; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[6:7] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1735,7 +1747,7 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, 42 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 @@ -1747,7 +1759,7 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; ; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_i32: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, 42 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1757,7 +1769,7 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; ; GFX1100-SDAG-LABEL: test_writelane_imm_oldval_i32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, 42 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1769,7 +1781,7 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i32: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, 42 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 @@ -1781,7 +1793,7 @@ define amdgpu_kernel void 
@test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; ; GFX1010-GISEL-LABEL: test_writelane_imm_oldval_i32: ; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 42 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1791,7 +1803,7 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; ; GFX1100-GISEL-LABEL: test_writelane_imm_oldval_i32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 42 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1808,8 +1820,8 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 42 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1824,22 +1836,22 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, ; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_i64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, 42 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s6 -; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s4 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s4 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_imm_oldval_i64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x10 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x10 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, 42 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -1853,12 +1865,12 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, ; ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 42 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s4 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0 @@ -1869,22 +1881,22 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, ; GFX1010-GISEL-LABEL: test_writelane_imm_oldval_i64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 42 
; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s6 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s4 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s4 ; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_imm_oldval_i64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x10 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x10 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 42 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 @@ -1903,8 +1915,8 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, double %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_oldval_f64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1919,22 +1931,22 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, ; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_f64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000 ; 
GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s6 -; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s4 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s4 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_imm_oldval_f64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x10 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x10 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -1948,12 +1960,12 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, ; ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_f64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s4 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0 @@ -1964,22 +1976,22 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, ; GFX1010-GISEL-LABEL: test_writelane_imm_oldval_f64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 
0x0 +; GFX1010-GISEL-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s6 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s4 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s4 ; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_imm_oldval_f64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x10 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x10 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll index eeddb3d5b8192..5cf457d1753b3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll @@ -10,7 +10,7 @@ declare <2 x half> @llvm.ceil.v2f16(<2 x half> %a) define amdgpu_kernel void @ceil_f16( ; SI-LABEL: ceil_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -30,7 +30,7 @@ define amdgpu_kernel void @ceil_f16( ; ; VI-LABEL: ceil_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -48,7 +48,7 @@ define amdgpu_kernel void @ceil_f16( ; ; GFX11-LABEL: ceil_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], 
s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -68,7 +68,7 @@ define amdgpu_kernel void @ceil_f16( ; ; GFX11-FAKE16-LABEL: ceil_f16: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -104,7 +104,7 @@ entry: define amdgpu_kernel void @ceil_v2f16( ; SI-LABEL: ceil_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -130,7 +130,7 @@ define amdgpu_kernel void @ceil_v2f16( ; ; VI-LABEL: ceil_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -150,7 +150,7 @@ define amdgpu_kernel void @ceil_v2f16( ; ; GFX11-LABEL: ceil_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -179,7 +179,7 @@ define amdgpu_kernel void @ceil_v2f16( ; ; GFX11-FAKE16-LABEL: ceil_v2f16: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll index fcc4cb3436fd7..5514efa6838e7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-LABEL: cos_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -30,7 +30,7 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX8-LABEL: cos_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -46,7 +46,7 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX9-LABEL: cos_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -58,7 +58,7 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX10-LABEL: cos_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] @@ -70,7 +70,7 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX11-LABEL: cos_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -91,7 +91,7 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-LABEL: cos_v2f16: ; GFX6: ; %bb.0: -; 
GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -121,7 +121,7 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX8-LABEL: cos_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -142,7 +142,7 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX9-LABEL: cos_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -158,7 +158,7 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX10-LABEL: cos_v2f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -174,7 +174,7 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX11-LABEL: cos_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index 3a867879bb809..142145098df87 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -12,33 +12,34 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-LABEL: s_exp_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, 
s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s3, s2, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; VI-SDAG-NEXT: v_sub_f32_e32 v1, s2, v1 +; VI-SDAG-NEXT: s_and_b32 s0, s4, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, s4, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x39a3b295, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s3, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x39a3b295 ; VI-SDAG-NEXT: v_rndne_f32_e32 v2, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, s3, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, s0, v3 ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v3, v1 ; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v2 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 @@ -46,33 +47,34 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_exp_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; 
VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x39a3b295 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s3, s2, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, s3 -; VI-GISEL-NEXT: v_sub_f32_e32 v2, s2, v2 +; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; VI-GISEL-NEXT: v_sub_f32_e32 v2, s4, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x39a3b295, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s3, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s3, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-GISEL-NEXT: v_rndne_f32_e32 v2, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -80,16 +82,16 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_exp_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; 
GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 @@ -97,36 +99,36 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX900-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-GISEL-NEXT: v_fma_f32 
v0, s2, v1, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3 ; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2ce8ed0 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] @@ -134,10 +136,10 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; ; SI-SDAG-LABEL: s_exp_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-SDAG-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0 @@ -162,29 +164,29 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_exp_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-GISEL-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 -; 
SI-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0 +; SI-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2 ; SI-GISEL-NEXT: v_rndne_f32_e32 v3, v2 -; SI-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0 +; SI-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0 ; SI-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3 ; SI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2ce8ed0 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; @@ -336,7 +338,7 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-SDAG-LABEL: s_exp_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: s_and_b32 s4, s3, 0xfffff000 @@ -388,7 +390,7 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; VI-GISEL-LABEL: s_exp_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x39a3b295 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -440,7 +442,7 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; 
; GFX900-SDAG-LABEL: s_exp_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 @@ -479,7 +481,7 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX900-GISEL-LABEL: s_exp_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 @@ -518,7 +520,7 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; SI-SDAG-LABEL: s_exp_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 @@ -560,7 +562,7 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; SI-GISEL-LABEL: s_exp_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 @@ -851,25 +853,25 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-LABEL: s_exp_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt 
lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; VI-SDAG-NEXT: s_and_b32 s0, s6, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, s6, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x39a3b295 ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, s2, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v7, s5, v7 @@ -915,6 +917,7 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0 @@ -923,19 +926,19 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; VI-GISEL-LABEL: s_exp_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8a000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x39a3b295 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 +; VI-GISEL-NEXT: 
v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, s4, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, s2, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, s0, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s2, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s0, v2 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v5, s5, v5 @@ -987,6 +990,7 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v4 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s0 @@ -995,11 +999,11 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; GFX900-SDAG-LABEL: s_exp_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x42b17218 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s5, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6 @@ -1048,10 +1052,10 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; GFX900-GISEL-LABEL: s_exp_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-GISEL-NEXT: 
v_mov_b32_e32 v1, 0x3fb8aa3b ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x32a5705f -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1 ; GFX900-GISEL-NEXT: v_fma_f32 v6, s5, v1, -v5 @@ -1101,10 +1105,10 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; SI-SDAG-LABEL: s_exp_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x32a5705f -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v0 @@ -1156,10 +1160,10 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; SI-GISEL-LABEL: s_exp_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x32a5705f -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1 @@ -1590,26 +1594,26 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-SDAG-LABEL: s_exp_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s2, s7, 0xfffff000 -; 
VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; VI-SDAG-NEXT: s_and_b32 s0, s7, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, s7, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x39a3b295 ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, s2, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000 ; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v7, s6, v7 @@ -1673,6 +1677,7 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v5 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0 @@ -1681,29 +1686,28 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; VI-GISEL-LABEL: s_exp_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3fb8a000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x39a3b295 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x42b17218 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 
s0 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, s4, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s2, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s0, v3 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v4, v0 ; VI-GISEL-NEXT: v_rndne_f32_e32 v4, v1 ; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v4 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v4 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v2 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 ; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v1, s5, v1 @@ -1719,7 +1723,7 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; VI-GISEL-NEXT: s_and_b32 s2, s6, 0xfffff000 ; VI-GISEL-NEXT: v_mul_f32_e32 v8, s2, v2 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 ; VI-GISEL-NEXT: v_ldexp_f32 v1, v1, v6 ; VI-GISEL-NEXT: v_mov_b32_e32 v6, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v6, s6, v6 @@ -1744,6 +1748,7 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9 ; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v8 ; VI-GISEL-NEXT: v_rndne_f32_e32 v8, v2 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v8 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v7, 0x7f800000 @@ -1764,6 +1769,7 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; VI-GISEL-NEXT: 
v_cmp_gt_f32_e32 vcc, s7, v5 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s0 @@ -1772,11 +1778,11 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; GFX900-SDAG-LABEL: s_exp_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 @@ -1787,8 +1793,8 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v9, 0x7f800000 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v3 @@ -1833,17 +1839,16 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc -; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x32a5705f ; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x42b17218 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2 ; GFX900-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0 @@ -1905,11 +1910,11 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; SI-SDAG-LABEL: s_exp_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; SI-SDAG-NEXT: v_mov_b32_e32 v5, 0x42b17218 -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v2 @@ -1921,7 +1926,7 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 ; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v4 -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v6, v3 @@ -1967,17 +1972,16 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_exp_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-GISEL-NEXT: 
v_mov_b32_e32 v2, 0x3fb8aa3b ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x32a5705f ; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x42b17218 -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2 ; SI-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index a162949587481..4d981d27c309e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -14,33 +14,34 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-LABEL: s_exp10_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s3, s2, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; VI-SDAG-NEXT: v_sub_f32_e32 v1, s2, v1 +; VI-SDAG-NEXT: s_and_b32 s0, s4, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, s4, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549000, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s3, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x3a2784bc ; VI-SDAG-NEXT: v_rndne_f32_e32 v2, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, s3, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, s0, v3 ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v3, v1 ; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v2 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1 +; VI-SDAG-NEXT: 
v_cmp_nlt_f32_e32 vcc, s4, v1 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x421a209b ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 @@ -48,33 +49,34 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_exp10_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3a2784bc -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s3, s2, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, s3 -; VI-GISEL-NEXT: v_sub_f32_e32 v2, s2, v2 +; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; VI-GISEL-NEXT: v_sub_f32_e32 v2, s4, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x40549000, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s3, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s3, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-GISEL-NEXT: v_rndne_f32_e32 v2, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b ; 
VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -82,16 +84,16 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_exp10_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 @@ -99,36 +101,36 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x421a209b ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GFX900-SDAG-NEXT: global_store_dword v2, v0, 
s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp10_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX900-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3 ; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc23369f4 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] @@ -136,10 +138,10 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; ; SI-SDAG-LABEL: s_exp10_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-SDAG-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0 @@ -164,29 +166,29 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_exp10_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-GISEL-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37 -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 -; SI-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0 +; SI-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2 ; SI-GISEL-NEXT: v_rndne_f32_e32 v3, v2 -; SI-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0 +; SI-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0 ; SI-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3 ; SI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc23369f4 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b ; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; @@ -338,7 +340,7 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-SDAG-LABEL: s_exp10_v2f32: ; VI-SDAG: ; 
%bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: s_and_b32 s4, s3, 0xfffff000 @@ -390,7 +392,7 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-GISEL-LABEL: s_exp10_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3a2784bc ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -442,7 +444,7 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp10_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4 @@ -481,7 +483,7 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-GISEL-LABEL: s_exp10_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 @@ -520,7 +522,7 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-SDAG-LABEL: s_exp10_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 @@ -562,7 +564,7 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-GISEL-LABEL: 
s_exp10_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 @@ -853,25 +855,25 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-LABEL: s_exp10_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; VI-SDAG-NEXT: s_and_b32 s0, s6, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, s6, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x40549000, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x3a2784bc ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, s2, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v7, s5, v7 @@ -917,6 +919,7 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5 +; VI-SDAG-NEXT: 
s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0 @@ -925,19 +928,19 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-GISEL-LABEL: s_exp10_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40549000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3a2784bc -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, s4, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, s2, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, s0, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s2, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s0, v2 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v5, s5, v5 @@ -989,6 +992,7 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v4 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s0 @@ -997,11 +1001,11 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp10_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; 
GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x421a209b -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s5, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6 @@ -1050,10 +1054,10 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-GISEL-LABEL: s_exp10_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x40549a78 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x33979a37 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1 ; GFX900-GISEL-NEXT: v_fma_f32 v6, s5, v1, -v5 @@ -1103,10 +1107,10 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; SI-SDAG-LABEL: s_exp10_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x33979a37 -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v0 @@ -1158,10 +1162,10 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; SI-GISEL-LABEL: s_exp10_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40549a78 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 
0x33979a37 -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1 @@ -1592,26 +1596,26 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-SDAG-LABEL: s_exp10_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s2, s7, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; VI-SDAG-NEXT: s_and_b32 s0, s7, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, s7, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x40549000, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x3a2784bc ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, s2, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000 ; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v7, s6, v7 @@ -1675,6 +1679,7 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v5 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1 
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0 @@ -1683,29 +1688,28 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-GISEL-LABEL: s_exp10_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x40549000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3a2784bc ; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x421a209b -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, s4, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s2, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s0, v3 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v4, v0 ; VI-GISEL-NEXT: v_rndne_f32_e32 v4, v1 ; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v4 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v4 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v2 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc23369f4 ; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v1, s5, v1 @@ -1721,7 +1725,7 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; VI-GISEL-NEXT: s_and_b32 s2, s6, 0xfffff000 ; VI-GISEL-NEXT: v_mul_f32_e32 v8, s2, v2 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc23369f4 ; 
VI-GISEL-NEXT: v_ldexp_f32 v1, v1, v6 ; VI-GISEL-NEXT: v_mov_b32_e32 v6, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v6, s6, v6 @@ -1746,6 +1750,7 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9 ; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v8 ; VI-GISEL-NEXT: v_rndne_f32_e32 v8, v2 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v8 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v7, 0x7f800000 @@ -1766,6 +1771,7 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v5 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s0 @@ -1774,11 +1780,11 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp10_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 @@ -1789,8 +1795,8 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v9, 0x7f800000 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 
; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v3 @@ -1835,17 +1841,16 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc -; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp10_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x40549a78 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x33979a37 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x421a209b -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2 ; GFX900-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0 @@ -1907,11 +1912,11 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; SI-SDAG-LABEL: s_exp10_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; SI-SDAG-NEXT: v_mov_b32_e32 v5, 0x421a209b -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v2 @@ -1923,7 +1928,7 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0xc23369f4 ; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v4 -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; 
SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v6, v3 @@ -1969,17 +1974,16 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_exp10_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x40549a78 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x33979a37 ; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x421a209b -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2 ; SI-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index 36e78975cdb01..9f80e66e8f873 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -12,17 +12,17 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-LABEL: s_exp2_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, 
vcc -; SI-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 +; SI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 @@ -31,35 +31,35 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_exp2_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; SI-GISEL-NEXT: v_add_f32_e32 v0, s2, v0 +; SI-GISEL-NEXT: v_add_f32_e32 v0, s4, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_exp2_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 +; VI-SDAG-NEXT: 
v_add_f32_e32 v1, s4, v1 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, v1, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -69,14 +69,14 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_exp2_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-GISEL-NEXT: v_add_f32_e32 v0, s2, v0 +; VI-GISEL-NEXT: v_add_f32_e32 v0, s4, v0 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc @@ -88,8 +88,8 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_exp2_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -101,24 +101,25 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[2:3] +; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp2_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX900-GISEL-NEXT: 
s_load_dword s0, s[2:3], 0x2c ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s2, v0 +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s0, v0 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX900-GISEL-NEXT: s_endpgm ; @@ -172,7 +173,7 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_exp2_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -198,7 +199,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-GISEL-LABEL: s_exp2_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x1f800000 @@ -222,7 +223,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-SDAG-LABEL: s_exp2_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 @@ -246,7 +247,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-GISEL-LABEL: s_exp2_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x1f800000 @@ -270,7 +271,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp2_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 @@ -293,7 +294,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-GISEL-LABEL: s_exp2_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x1f800000 @@ -380,8 +381,8 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-LABEL: s_exp2_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 
@@ -412,8 +413,8 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; SI-GISEL-LABEL: s_exp2_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 @@ -444,8 +445,8 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-SDAG-LABEL: s_exp2_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -475,11 +476,11 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-GISEL-LABEL: s_exp2_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc @@ -506,8 +507,8 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp2_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; 
GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -531,16 +532,16 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, v4, v2 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v6, v5 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v3, v0 -; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[2:3] +; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp2_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc @@ -655,45 +656,45 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-LABEL: s_exp2_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 
v2, 1.0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; SI-SDAG-NEXT: v_add_f32_e32 v4, s3, v4 -; SI-SDAG-NEXT: v_add_f32_e32 v6, s2, v6 -; SI-SDAG-NEXT: v_add_f32_e32 v8, s1, v8 -; SI-SDAG-NEXT: v_add_f32_e32 v1, s0, v1 +; SI-SDAG-NEXT: v_add_f32_e32 v4, s7, v4 +; SI-SDAG-NEXT: v_add_f32_e32 v6, s6, v6 +; SI-SDAG-NEXT: v_add_f32_e32 v8, s5, v8 +; SI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v4, v4 ; SI-SDAG-NEXT: v_exp_f32_e32 v6, v6 ; SI-SDAG-NEXT: v_exp_f32_e32 v8, v8 ; SI-SDAG-NEXT: v_exp_f32_e32 v9, v1 -; SI-SDAG-NEXT: s_mov_b32 s6, -1 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, v4, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, v6, v5 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, v8, v7 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, v9, v0 -; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_exp2_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 @@ -729,8 +730,8 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x 
float> %in) ; ; VI-SDAG-LABEL: s_exp2_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -766,8 +767,8 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-GISEL-LABEL: s_exp2_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 @@ -803,8 +804,8 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp2_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -834,13 +835,13 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, v7, v6 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v9, v8 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v10, v0 -; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp2_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-GISEL-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 @@ -870,7 +871,7 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, v5, v2 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v3, v3, v4 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] ; GFX900-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_exp2_v4f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll index e8d037c5ff53e..ece55c7f7dcea 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll @@ -10,7 +10,7 @@ declare <2 x half> @llvm.floor.v2f16(<2 x half> %a) define amdgpu_kernel void @floor_f16( ; SI-LABEL: floor_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -30,7 +30,7 @@ define amdgpu_kernel void @floor_f16( ; ; VI-LABEL: floor_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -48,7 +48,7 @@ define amdgpu_kernel void @floor_f16( ; ; GFX11-LABEL: floor_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -68,7 +68,7 @@ define amdgpu_kernel void @floor_f16( ; ; GFX11-FAKE16-LABEL: floor_f16: ; 
GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -105,7 +105,7 @@ entry: define amdgpu_kernel void @floor_v2f16( ; SI-LABEL: floor_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -131,7 +131,7 @@ define amdgpu_kernel void @floor_v2f16( ; ; VI-LABEL: floor_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -151,7 +151,7 @@ define amdgpu_kernel void @floor_v2f16( ; ; GFX11-LABEL: floor_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -180,7 +180,7 @@ define amdgpu_kernel void @floor_v2f16( ; ; GFX11-FAKE16-LABEL: floor_v2f16: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll index a2e30603b6afc..edcdd323cb0ae 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -14,7 +14,7 @@ declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> define amdgpu_kernel void @fmuladd_f16( ; SI-LABEL: fmuladd_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 
s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -48,7 +48,7 @@ define amdgpu_kernel void @fmuladd_f16( ; ; VI-FLUSH-LABEL: fmuladd_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000 ; VI-FLUSH-NEXT: s_mov_b32 s10, -1 ; VI-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -76,7 +76,7 @@ define amdgpu_kernel void @fmuladd_f16( ; ; VI-DENORM-LABEL: fmuladd_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000 ; VI-DENORM-NEXT: s_mov_b32 s10, -1 ; VI-DENORM-NEXT: s_mov_b32 s14, s10 @@ -104,7 +104,7 @@ define amdgpu_kernel void @fmuladd_f16( ; ; GFX10-FLUSH-LABEL: fmuladd_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: s_mov_b32 s10, -1 ; GFX10-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 ; GFX10-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -134,7 +134,7 @@ define amdgpu_kernel void @fmuladd_f16( ; ; GFX10-DENORM-LABEL: fmuladd_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-DENORM-NEXT: s_mov_b32 s10, -1 ; GFX10-DENORM-NEXT: s_mov_b32 s11, 0x31016000 ; GFX10-DENORM-NEXT: s_mov_b32 s14, s10 @@ -162,7 +162,7 @@ define amdgpu_kernel void @fmuladd_f16( ; ; GFX11-FLUSH-LABEL: fmuladd_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1 ; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -195,7 +195,7 @@ define amdgpu_kernel void @fmuladd_f16( ; ; GFX11-DENORM-LABEL: fmuladd_f16: ; 
GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-DENORM-NEXT: s_mov_b32 s10, -1 ; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-DENORM-NEXT: s_mov_b32 s14, s10 @@ -237,131 +237,131 @@ define amdgpu_kernel void @fmuladd_f16( define amdgpu_kernel void @fmuladd_f16_imm_a( ; SI-LABEL: fmuladd_f16_imm_a: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_madmk_f32 v0, v0, 0x40400000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-FLUSH-LABEL: fmuladd_f16_imm_a: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-FLUSH-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 -; VI-FLUSH-NEXT: s_mov_b32 s2, -1 -; VI-FLUSH-NEXT: s_mov_b32 s14, s2 +; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s10, -1 +; VI-FLUSH-NEXT: s_mov_b32 s14, s10 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_mov_b32 s12, s6 ; VI-FLUSH-NEXT: s_mov_b32 s13, s7 -; VI-FLUSH-NEXT: s_mov_b32 s15, s3 -; VI-FLUSH-NEXT: s_mov_b32 s10, s2 -; VI-FLUSH-NEXT: s_mov_b32 s11, s3 +; VI-FLUSH-NEXT: s_mov_b32 s15, s11 +; VI-FLUSH-NEXT: s_mov_b32 s2, s10 +; VI-FLUSH-NEXT: s_mov_b32 s3, s11 ; VI-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: s_mov_b32 s0, s4 -; VI-FLUSH-NEXT: s_mov_b32 s1, s5 +; VI-FLUSH-NEXT: s_mov_b32 s8, s4 +; VI-FLUSH-NEXT: s_mov_b32 s9, s5 ; VI-FLUSH-NEXT: v_madmk_f16 v0, v0, 0x4200, v1 -; VI-FLUSH-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-FLUSH-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-FLUSH-NEXT: s_endpgm ; ; VI-DENORM-LABEL: fmuladd_f16_imm_a: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-DENORM-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-DENORM-NEXT: s_mov_b32 s3, 0xf000 -; VI-DENORM-NEXT: s_mov_b32 s2, -1 -; VI-DENORM-NEXT: s_mov_b32 s14, s2 +; VI-DENORM-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000 +; VI-DENORM-NEXT: s_mov_b32 s10, -1 +; VI-DENORM-NEXT: s_mov_b32 s14, s10 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_mov_b32 s12, s6 ; VI-DENORM-NEXT: s_mov_b32 s13, s7 -; VI-DENORM-NEXT: s_mov_b32 s15, s3 -; VI-DENORM-NEXT: s_mov_b32 s10, s2 -; VI-DENORM-NEXT: s_mov_b32 s11, s3 +; VI-DENORM-NEXT: s_mov_b32 s15, s11 +; VI-DENORM-NEXT: s_mov_b32 s2, s10 +; VI-DENORM-NEXT: s_mov_b32 s3, s11 ; VI-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; 
VI-DENORM-NEXT: s_waitcnt vmcnt(0) -; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; VI-DENORM-NEXT: s_waitcnt vmcnt(0) -; VI-DENORM-NEXT: s_mov_b32 s0, s4 -; VI-DENORM-NEXT: s_movk_i32 s4, 0x4200 -; VI-DENORM-NEXT: s_mov_b32 s1, s5 -; VI-DENORM-NEXT: v_fma_f16 v0, v0, s4, v1 -; VI-DENORM-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-DENORM-NEXT: s_movk_i32 s0, 0x4200 +; VI-DENORM-NEXT: s_mov_b32 s8, s4 +; VI-DENORM-NEXT: s_mov_b32 s9, s5 +; VI-DENORM-NEXT: v_fma_f16 v0, v0, s0, v1 +; VI-DENORM-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-DENORM-NEXT: s_endpgm ; ; GFX10-FLUSH-LABEL: fmuladd_f16_imm_a: ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_clause 0x1 -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX10-FLUSH-NEXT: s_mov_b32 s2, -1 -; GFX10-FLUSH-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-FLUSH-NEXT: s_mov_b32 s14, s2 -; GFX10-FLUSH-NEXT: s_mov_b32 s15, s3 -; GFX10-FLUSH-NEXT: s_mov_b32 s10, s2 -; GFX10-FLUSH-NEXT: s_mov_b32 s11, s3 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-FLUSH-NEXT: s_mov_b32 s10, -1 +; GFX10-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 +; GFX10-FLUSH-NEXT: s_mov_b32 s14, s10 +; GFX10-FLUSH-NEXT: s_mov_b32 s15, s11 +; GFX10-FLUSH-NEXT: s_mov_b32 s2, s10 +; GFX10-FLUSH-NEXT: s_mov_b32 s3, s11 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_mov_b32 s12, s6 ; GFX10-FLUSH-NEXT: s_mov_b32 s13, s7 -; GFX10-FLUSH-NEXT: s_mov_b32 s0, s4 +; GFX10-FLUSH-NEXT: s_mov_b32 s8, s4 ; GFX10-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc +; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: s_mov_b32 s1, s5 +; GFX10-FLUSH-NEXT: 
s_mov_b32 s9, s5 ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[8:11], 0 ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-LABEL: fmuladd_f16_imm_a: ; GFX10-DENORM: ; %bb.0: ; GFX10-DENORM-NEXT: s_clause 0x1 -; GFX10-DENORM-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX10-DENORM-NEXT: s_mov_b32 s2, -1 -; GFX10-DENORM-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-DENORM-NEXT: s_mov_b32 s14, s2 -; GFX10-DENORM-NEXT: s_mov_b32 s15, s3 -; GFX10-DENORM-NEXT: s_mov_b32 s10, s2 -; GFX10-DENORM-NEXT: s_mov_b32 s11, s3 +; GFX10-DENORM-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DENORM-NEXT: s_mov_b32 s10, -1 +; GFX10-DENORM-NEXT: s_mov_b32 s11, 0x31016000 +; GFX10-DENORM-NEXT: s_mov_b32 s14, s10 +; GFX10-DENORM-NEXT: s_mov_b32 s15, s11 +; GFX10-DENORM-NEXT: s_mov_b32 s2, s10 +; GFX10-DENORM-NEXT: s_mov_b32 s3, s11 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_mov_b32 s12, s6 ; GFX10-DENORM-NEXT: s_mov_b32 s13, s7 ; GFX10-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc +; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc dlc ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-NEXT: s_mov_b32 s0, s4 -; GFX10-DENORM-NEXT: s_mov_b32 s1, s5 +; GFX10-DENORM-NEXT: s_mov_b32 s8, s4 +; GFX10-DENORM-NEXT: s_mov_b32 s9, s5 ; GFX10-DENORM-NEXT: v_fmamk_f16 v0, v0, 0x4200, v1 -; GFX10-DENORM-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX10-DENORM-NEXT: buffer_store_short v0, off, s[8:11], 0 ; GFX10-DENORM-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: fmuladd_f16_imm_a: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_clause 0x1 -; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], 
s[0:1], 0x24 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1 ; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -388,8 +388,8 @@ define amdgpu_kernel void @fmuladd_f16_imm_a( ; GFX11-DENORM-LABEL: fmuladd_f16_imm_a: ; GFX11-DENORM: ; %bb.0: ; GFX11-DENORM-NEXT: s_clause 0x1 -; GFX11-DENORM-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DENORM-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-DENORM-NEXT: s_mov_b32 s10, -1 ; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-DENORM-NEXT: s_mov_b32 s14, s10 @@ -423,131 +423,131 @@ define amdgpu_kernel void @fmuladd_f16_imm_a( define amdgpu_kernel void @fmuladd_f16_imm_b( ; SI-LABEL: fmuladd_f16_imm_b: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; 
SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_madmk_f32 v0, v0, 0x40400000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-FLUSH-LABEL: fmuladd_f16_imm_b: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-FLUSH-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 -; VI-FLUSH-NEXT: s_mov_b32 s2, -1 -; VI-FLUSH-NEXT: s_mov_b32 s14, s2 +; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s10, -1 +; VI-FLUSH-NEXT: s_mov_b32 s14, s10 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_mov_b32 s12, s6 ; VI-FLUSH-NEXT: s_mov_b32 s13, s7 -; VI-FLUSH-NEXT: s_mov_b32 s15, s3 -; VI-FLUSH-NEXT: s_mov_b32 s10, s2 -; VI-FLUSH-NEXT: s_mov_b32 s11, s3 +; VI-FLUSH-NEXT: s_mov_b32 s15, s11 +; VI-FLUSH-NEXT: s_mov_b32 s2, s10 +; VI-FLUSH-NEXT: s_mov_b32 s3, s11 ; VI-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: s_mov_b32 s0, s4 -; VI-FLUSH-NEXT: s_mov_b32 s1, s5 +; VI-FLUSH-NEXT: s_mov_b32 s8, s4 +; VI-FLUSH-NEXT: s_mov_b32 s9, s5 ; VI-FLUSH-NEXT: v_madmk_f16 v0, v0, 0x4200, v1 -; VI-FLUSH-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-FLUSH-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-FLUSH-NEXT: s_endpgm ; ; VI-DENORM-LABEL: fmuladd_f16_imm_b: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-DENORM-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-DENORM-NEXT: s_mov_b32 s3, 0xf000 -; VI-DENORM-NEXT: s_mov_b32 s2, -1 -; VI-DENORM-NEXT: s_mov_b32 s14, s2 +; VI-DENORM-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; 
VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000 +; VI-DENORM-NEXT: s_mov_b32 s10, -1 +; VI-DENORM-NEXT: s_mov_b32 s14, s10 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_mov_b32 s12, s6 ; VI-DENORM-NEXT: s_mov_b32 s13, s7 -; VI-DENORM-NEXT: s_mov_b32 s15, s3 -; VI-DENORM-NEXT: s_mov_b32 s10, s2 -; VI-DENORM-NEXT: s_mov_b32 s11, s3 +; VI-DENORM-NEXT: s_mov_b32 s15, s11 +; VI-DENORM-NEXT: s_mov_b32 s2, s10 +; VI-DENORM-NEXT: s_mov_b32 s3, s11 ; VI-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-DENORM-NEXT: s_waitcnt vmcnt(0) -; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; VI-DENORM-NEXT: s_waitcnt vmcnt(0) -; VI-DENORM-NEXT: s_mov_b32 s0, s4 -; VI-DENORM-NEXT: s_movk_i32 s4, 0x4200 -; VI-DENORM-NEXT: s_mov_b32 s1, s5 -; VI-DENORM-NEXT: v_fma_f16 v0, v0, s4, v1 -; VI-DENORM-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-DENORM-NEXT: s_movk_i32 s0, 0x4200 +; VI-DENORM-NEXT: s_mov_b32 s8, s4 +; VI-DENORM-NEXT: s_mov_b32 s9, s5 +; VI-DENORM-NEXT: v_fma_f16 v0, v0, s0, v1 +; VI-DENORM-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-DENORM-NEXT: s_endpgm ; ; GFX10-FLUSH-LABEL: fmuladd_f16_imm_b: ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_clause 0x1 -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX10-FLUSH-NEXT: s_mov_b32 s2, -1 -; GFX10-FLUSH-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-FLUSH-NEXT: s_mov_b32 s14, s2 -; GFX10-FLUSH-NEXT: s_mov_b32 s15, s3 -; GFX10-FLUSH-NEXT: s_mov_b32 s10, s2 -; GFX10-FLUSH-NEXT: s_mov_b32 s11, s3 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-FLUSH-NEXT: s_mov_b32 s10, -1 +; GFX10-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 +; GFX10-FLUSH-NEXT: s_mov_b32 s14, s10 +; GFX10-FLUSH-NEXT: s_mov_b32 s15, s11 +; GFX10-FLUSH-NEXT: s_mov_b32 s2, s10 +; 
GFX10-FLUSH-NEXT: s_mov_b32 s3, s11 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_mov_b32 s12, s6 ; GFX10-FLUSH-NEXT: s_mov_b32 s13, s7 -; GFX10-FLUSH-NEXT: s_mov_b32 s0, s4 +; GFX10-FLUSH-NEXT: s_mov_b32 s8, s4 ; GFX10-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc +; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: s_mov_b32 s1, s5 +; GFX10-FLUSH-NEXT: s_mov_b32 s9, s5 ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[8:11], 0 ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-LABEL: fmuladd_f16_imm_b: ; GFX10-DENORM: ; %bb.0: ; GFX10-DENORM-NEXT: s_clause 0x1 -; GFX10-DENORM-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX10-DENORM-NEXT: s_mov_b32 s2, -1 -; GFX10-DENORM-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-DENORM-NEXT: s_mov_b32 s14, s2 -; GFX10-DENORM-NEXT: s_mov_b32 s15, s3 -; GFX10-DENORM-NEXT: s_mov_b32 s10, s2 -; GFX10-DENORM-NEXT: s_mov_b32 s11, s3 +; GFX10-DENORM-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DENORM-NEXT: s_mov_b32 s10, -1 +; GFX10-DENORM-NEXT: s_mov_b32 s11, 0x31016000 +; GFX10-DENORM-NEXT: s_mov_b32 s14, s10 +; GFX10-DENORM-NEXT: s_mov_b32 s15, s11 +; GFX10-DENORM-NEXT: s_mov_b32 s2, s10 +; GFX10-DENORM-NEXT: s_mov_b32 s3, s11 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_mov_b32 s12, s6 ; GFX10-DENORM-NEXT: s_mov_b32 s13, s7 ; GFX10-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc +; GFX10-DENORM-NEXT: buffer_load_ushort 
v1, off, s[0:3], 0 glc dlc ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-NEXT: s_mov_b32 s0, s4 -; GFX10-DENORM-NEXT: s_mov_b32 s1, s5 +; GFX10-DENORM-NEXT: s_mov_b32 s8, s4 +; GFX10-DENORM-NEXT: s_mov_b32 s9, s5 ; GFX10-DENORM-NEXT: v_fmamk_f16 v0, v0, 0x4200, v1 -; GFX10-DENORM-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX10-DENORM-NEXT: buffer_store_short v0, off, s[8:11], 0 ; GFX10-DENORM-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: fmuladd_f16_imm_b: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_clause 0x1 -; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1 ; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -574,8 +574,8 @@ define amdgpu_kernel void @fmuladd_f16_imm_b( ; GFX11-DENORM-LABEL: fmuladd_f16_imm_b: ; GFX11-DENORM: ; %bb.0: ; GFX11-DENORM-NEXT: s_clause 0x1 -; GFX11-DENORM-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DENORM-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-DENORM-NEXT: s_mov_b32 s10, -1 ; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-DENORM-NEXT: s_mov_b32 s14, s10 @@ -609,7 +609,7 @@ define amdgpu_kernel void @fmuladd_f16_imm_b( define amdgpu_kernel void @fmuladd_v2f16( ; SI-LABEL: fmuladd_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -653,7 +653,7 @@ define amdgpu_kernel void @fmuladd_v2f16( ; ; VI-FLUSH-LABEL: fmuladd_v2f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000 ; VI-FLUSH-NEXT: s_mov_b32 
s10, -1 ; VI-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -686,7 +686,7 @@ define amdgpu_kernel void @fmuladd_v2f16( ; ; VI-DENORM-LABEL: fmuladd_v2f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000 ; VI-DENORM-NEXT: s_mov_b32 s10, -1 ; VI-DENORM-NEXT: s_mov_b32 s14, s10 @@ -722,7 +722,7 @@ define amdgpu_kernel void @fmuladd_v2f16( ; ; GFX10-FLUSH-LABEL: fmuladd_v2f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: s_mov_b32 s10, -1 ; GFX10-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 ; GFX10-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -752,7 +752,7 @@ define amdgpu_kernel void @fmuladd_v2f16( ; ; GFX10-DENORM-LABEL: fmuladd_v2f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-DENORM-NEXT: s_mov_b32 s10, -1 ; GFX10-DENORM-NEXT: s_mov_b32 s11, 0x31016000 ; GFX10-DENORM-NEXT: s_mov_b32 s14, s10 @@ -780,7 +780,7 @@ define amdgpu_kernel void @fmuladd_v2f16( ; ; GFX11-FLUSH-LABEL: fmuladd_v2f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1 ; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -813,7 +813,7 @@ define amdgpu_kernel void @fmuladd_v2f16( ; ; GFX11-DENORM-LABEL: fmuladd_v2f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-DENORM-NEXT: s_mov_b32 s10, -1 ; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-DENORM-NEXT: s_mov_b32 s14, s10 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll b/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll index aca7d3c720ceb..2bb4cc617e7f1 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll @@ -83,7 +83,7 @@ define i32 @strictfp_func_fpmode_i32() strictfp { define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) { ; GFX6-LABEL: kernel_fpmode_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 19) ; GFX6-NEXT: s_and_b32 s4, 0x7f3ff, s4 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -95,7 +95,7 @@ define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) { ; ; GFX7-LABEL: kernel_fpmode_i32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 19) ; GFX7-NEXT: s_and_b32 s4, 0x7f3ff, s4 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -107,7 +107,7 @@ define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) { ; ; GFX8-LABEL: kernel_fpmode_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 19) ; GFX8-NEXT: s_and_b32 s2, 0x7f3ff, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 @@ -119,7 +119,7 @@ define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) { ; ; GFX9-LABEL: kernel_fpmode_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 24) ; GFX9-NEXT: s_and_b32 s2, 0x87f3ff, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -130,7 +130,7 @@ define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) { ; ; GFX10-LABEL: kernel_fpmode_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 24) ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_and_b32 s2, 0x87f3ff, s2 
@@ -141,7 +141,7 @@ define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) { ; ; GFX11-LABEL: kernel_fpmode_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 24) ; GFX11-NEXT: s_and_b32 s2, 0x87f3ff, s2 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll index ea823f30f26c2..2e8049e9765e1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll @@ -14,8 +14,8 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) { ; GFX7CHECK-LABEL: sgpr_isnan_bf16: ; GFX7CHECK: ; %bb.0: -; GFX7CHECK-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX7CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7CHECK-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX7CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7CHECK-NEXT: s_mov_b32 s3, 0xf000 ; GFX7CHECK-NEXT: s_mov_b32 s2, -1 ; GFX7CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -28,13 +28,13 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) { ; ; GFX8CHECK-LABEL: sgpr_isnan_bf16: ; GFX8CHECK: ; %bb.0: -; GFX8CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8CHECK-NEXT: v_mov_b32_e32 v0, 0x7fff -; GFX8CHECK-NEXT: s_movk_i32 s3, 0x7f80 +; GFX8CHECK-NEXT: s_movk_i32 s2, 0x7f80 ; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX8CHECK-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s3, v0 +; GFX8CHECK-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s2, v0 ; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1 @@ -43,26 +43,26 @@ 
define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) { ; ; GFX9CHECK-LABEL: sgpr_isnan_bf16: ; GFX9CHECK: ; %bb.0: -; GFX9CHECK-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX9CHECK-NEXT: s_movk_i32 s0, 0x7f80 +; GFX9CHECK-NEXT: s_movk_i32 s2, 0x7f80 ; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX9CHECK-NEXT: s_waitcnt lgkmcnt(0) ; GFX9CHECK-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s0, v1 +; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s2, v1 ; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GFX9CHECK-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9CHECK-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9CHECK-NEXT: s_endpgm ; ; GFX10CHECK-LABEL: sgpr_isnan_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_clause 0x1 -; GFX10CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0 ; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10CHECK-NEXT: v_and_b32_e64 v0, 0x7fff, s2 +; GFX10CHECK-NEXT: v_and_b32_e64 v0, 0x7fff, s4 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX10CHECK-NEXT: global_store_dword v1, v0, s[0:1] @@ -71,11 +71,11 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) { ; GFX11CHECK-LABEL: sgpr_isnan_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_clause 0x1 -; GFX11CHECK-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11CHECK-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11CHECK-NEXT: v_mov_b32_e32 v1, 0 ; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11CHECK-NEXT: v_and_b32_e64 v0, 0x7fff, s2 +; GFX11CHECK-NEXT: v_and_b32_e64 v0, 0x7fff, s4 ; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11CHECK-NEXT: global_store_b32 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll index da64c379672ef..9c248bd6e8b2a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll @@ -13,8 +13,8 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; GFX7SELDAG-LABEL: sgpr_isnan_f16: ; GFX7SELDAG: ; %bb.0: -; GFX7SELDAG-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX7SELDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7SELDAG-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX7SELDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7SELDAG-NEXT: s_mov_b32 s3, 0xf000 ; GFX7SELDAG-NEXT: s_mov_b32 s2, -1 ; GFX7SELDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -27,11 +27,11 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; ; GFX7GLISEL-LABEL: sgpr_isnan_f16: ; GFX7GLISEL: ; %bb.0: -; GFX7GLISEL-NEXT: s_load_dword s3, s[0:1], 0xb -; GFX7GLISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7GLISEL-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX7GLISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7GLISEL-NEXT: s_mov_b32 s2, -1 ; GFX7GLISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX7GLISEL-NEXT: s_and_b32 s3, s3, 0x7fff +; GFX7GLISEL-NEXT: s_and_b32 s3, s4, 0x7fff ; GFX7GLISEL-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX7GLISEL-NEXT: s_cmpk_gt_u32 s3, 0x7c00 ; GFX7GLISEL-NEXT: s_cselect_b32 s3, 1, 0 @@ -43,10 +43,10 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; ; GFX8CHECK-LABEL: sgpr_isnan_f16: ; GFX8CHECK: ; %bb.0: -; GFX8CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8CHECK-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 ; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[2:3], s2, 3 +; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[2:3], s4, 3 ; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3] ; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1 @@ -55,23 +55,23 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; ; GFX9CHECK-LABEL: sgpr_isnan_f16: ; GFX9CHECK: ; %bb.0: -; GFX9CHECK-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX9CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[0:1], s4, 3 -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9CHECK-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[2:3], s4, 3 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[2:3] +; GFX9CHECK-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9CHECK-NEXT: s_endpgm ; ; GFX10CHECK-LABEL: sgpr_isnan_f16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_clause 0x1 -; GFX10CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s2, s2, 3 +; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s2, s4, 3 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10CHECK-NEXT: s_endpgm @@ -79,11 +79,11 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; GFX11CHECK-LABEL: sgpr_isnan_f16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_clause 0x1 -; GFX11CHECK-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; 
GFX11CHECK-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s2, s2, 3 +; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s2, s4, 3 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11CHECK-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll index 347e549e7cf56..a807885e0d853 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll @@ -13,8 +13,8 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) { ; GFX7SELDAG-LABEL: sgpr_isnan_f32: ; GFX7SELDAG: ; %bb.0: -; GFX7SELDAG-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX7SELDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7SELDAG-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX7SELDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7SELDAG-NEXT: s_mov_b32 s3, 0xf000 ; GFX7SELDAG-NEXT: s_mov_b32 s2, -1 ; GFX7SELDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -25,22 +25,22 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) { ; ; GFX7GLISEL-LABEL: sgpr_isnan_f32: ; GFX7GLISEL: ; %bb.0: -; GFX7GLISEL-NEXT: s_load_dword s3, s[0:1], 0xb -; GFX7GLISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7GLISEL-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX7GLISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7GLISEL-NEXT: s_mov_b32 s2, -1 +; GFX7GLISEL-NEXT: s_mov_b32 s3, 0xf000 ; GFX7GLISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX7GLISEL-NEXT: v_cmp_class_f32_e64 s[4:5], s3, 3 +; GFX7GLISEL-NEXT: v_cmp_class_f32_e64 s[4:5], s4, 3 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] -; GFX7GLISEL-NEXT: s_mov_b32 s3, 0xf000 ; GFX7GLISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7GLISEL-NEXT: s_endpgm ; ; GFX8CHECK-LABEL: sgpr_isnan_f32: ; GFX8CHECK: ; %bb.0: -; GFX8CHECK-NEXT: s_load_dword s2, 
s[0:1], 0x2c -; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX8CHECK-NEXT: v_cmp_class_f32_e64 s[2:3], s2, 3 +; GFX8CHECK-NEXT: v_cmp_class_f32_e64 s[2:3], s4, 3 ; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3] ; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1 @@ -49,23 +49,23 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) { ; ; GFX9CHECK-LABEL: sgpr_isnan_f32: ; GFX9CHECK: ; %bb.0: -; GFX9CHECK-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX9CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX9CHECK-NEXT: v_cmp_class_f32_e64 s[0:1], s4, 3 -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9CHECK-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9CHECK-NEXT: v_cmp_class_f32_e64 s[2:3], s4, 3 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[2:3] +; GFX9CHECK-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9CHECK-NEXT: s_endpgm ; ; GFX10CHECK-LABEL: sgpr_isnan_f32: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_clause 0x1 -; GFX10CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s2, s2, 3 +; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s2, s4, 3 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10CHECK-NEXT: s_endpgm @@ -73,11 +73,11 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) { ; GFX11CHECK-LABEL: sgpr_isnan_f32: ; 
GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_clause 0x1 -; GFX11CHECK-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11CHECK-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f32_e64 s2, s2, 3 +; GFX11CHECK-NEXT: v_cmp_class_f32_e64 s2, s4, 3 ; GFX11CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1] @@ -93,7 +93,7 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) { define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; GFX7SELDAG-LABEL: sgpr_isnan_f64: ; GFX7SELDAG: ; %bb.0: -; GFX7SELDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7SELDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7SELDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX7SELDAG-NEXT: s_mov_b32 s6, -1 ; GFX7SELDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -106,7 +106,7 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; ; GFX7GLISEL-LABEL: sgpr_isnan_f64: ; GFX7GLISEL: ; %bb.0: -; GFX7GLISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7GLISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7GLISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX7GLISEL-NEXT: v_cmp_class_f64_e64 s[2:3], s[2:3], 3 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[2:3] @@ -117,7 +117,7 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; ; GFX8SELDAG-LABEL: sgpr_isnan_f64: ; GFX8SELDAG: ; %bb.0: -; GFX8SELDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8SELDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8SELDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX8SELDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX8SELDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -128,7 +128,7 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; ; GFX8GLISEL-LABEL: sgpr_isnan_f64: ; 
GFX8GLISEL: ; %bb.0: -; GFX8GLISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8GLISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8GLISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GLISEL-NEXT: v_cmp_class_f64_e64 s[2:3], s[2:3], 3 ; GFX8GLISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -139,7 +139,7 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; ; GFX9CHECK-LABEL: sgpr_isnan_f64: ; GFX9CHECK: ; %bb.0: -; GFX9CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX9CHECK-NEXT: s_waitcnt lgkmcnt(0) ; GFX9CHECK-NEXT: v_cmp_class_f64_e64 s[2:3], s[2:3], 3 @@ -149,7 +149,7 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; ; GFX10CHECK-LABEL: sgpr_isnan_f64: ; GFX10CHECK: ; %bb.0: -; GFX10CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10CHECK-NEXT: v_cmp_class_f64_e64 s2, s[2:3], 3 @@ -159,7 +159,7 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; ; GFX11CHECK-LABEL: sgpr_isnan_f64: ; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0) ; GFX11CHECK-NEXT: v_cmp_class_f64_e64 s2, s[2:3], 3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index d847af780acab..c2f6fbfe4667c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -14,17 +14,17 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-LABEL: s_log_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-SDAG-NEXT: 
s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SI-SDAG-NEXT: s_mov_b32 s0, 0x3f317217 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s0, 0x3f317217 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s0, -v1 @@ -42,15 +42,15 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_log_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3377d1cf ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -70,15 +70,15 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; VI-SDAG-LABEL: s_log_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-SDAG-NEXT: s_waitcnt 
lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s0, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1 @@ -94,7 +94,6 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 @@ -102,15 +101,15 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_log_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1 @@ -126,7 +125,6 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -134,17 +132,17 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_log_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3f317217 +; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3377d1cf ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3377d1cf +; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3f317217 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v0 ; GFX900-SDAG-NEXT: v_fma_f32 v3, v0, s0, -v2 @@ -156,20 +154,20 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x41b17218 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[2:3] +; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[4:5] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX900-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 
0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3377d1cf ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -183,19 +181,18 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log_f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x2c ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff @@ -207,8 +204,9 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-SDAG-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 ; 
GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s4 ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -216,14 +214,13 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; GFX1100-GISEL-LABEL: s_log_f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff @@ -233,10 +230,11 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, s4 ; 
GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0 ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -318,7 +316,7 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_log_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s8, 0x3377d1cf @@ -359,7 +357,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; SI-GISEL-LABEL: s_log_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 @@ -398,7 +396,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; VI-SDAG-LABEL: s_log_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 @@ -445,7 +443,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; VI-GISEL-LABEL: s_log_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; 
VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -492,7 +490,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX900-SDAG-LABEL: s_log_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3f317217 @@ -530,7 +528,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX900-GISEL-LABEL: s_log_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 @@ -568,7 +566,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX1100-SDAG-LABEL: s_log_v2f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 @@ -603,7 +601,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX1100-GISEL-LABEL: s_log_v2f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3 @@ -749,8 +747,8 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-LABEL: s_log_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: 
s_load_dwordx4 s[8:11], s[0:1], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -802,8 +800,8 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; SI-GISEL-LABEL: s_log_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 @@ -855,7 +853,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; VI-SDAG-LABEL: s_log_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s8, 0x7f800000 @@ -864,7 +862,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc ; VI-SDAG-NEXT: v_mul_f32_e32 v2, s6, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v2 ; VI-SDAG-NEXT: v_sub_f32_e32 v4, v2, v3 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v3 @@ -921,8 +919,8 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; VI-GISEL-LABEL: s_log_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -986,8 +984,8 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; GFX900-SDAG-LABEL: s_log_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1037,8 +1035,8 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; GFX900-GISEL-LABEL: s_log_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 @@ -1089,19 +1087,19 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-SDAG-LABEL: s_log_v3f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 +; 
GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s2 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s8 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 @@ -1122,7 +1120,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v7, 0x3377d1cf, v1 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s9 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -1145,19 +1143,19 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-GISEL-LABEL: s_log_v3f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 
s[0:1], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s7 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 @@ -1178,7 +1176,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v7, 0x3377d1cf, v1 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s9 ; 
GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -1355,8 +1353,8 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-LABEL: s_log_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s12, 0x3377d1cf @@ -1419,8 +1417,8 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; SI-GISEL-LABEL: s_log_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3f317217 @@ -1483,8 +1481,8 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; VI-SDAG-LABEL: s_log_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1565,8 +1563,8 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; VI-GISEL-LABEL: s_log_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-GISEL-NEXT: 
s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1647,8 +1645,8 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; GFX900-SDAG-LABEL: s_log_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s10, 0x3377d1cf @@ -1710,8 +1708,8 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; GFX900-GISEL-LABEL: s_log_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3f317217 @@ -1774,32 +1772,32 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX1100-SDAG-LABEL: s_log_v4f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s7 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 -; GFX1100-SDAG-NEXT: 
v_cmp_gt_f32_e64 s9, 0x800000, s4 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s7 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s8 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s7, v0 :: v_dual_mul_f32 v1, s6, v1 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s5, v2 :: v_dual_mul_f32 v3, s4, v3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s9 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s8 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s10 +; 
GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s11 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_mul_f32 v6, 0x3f317217, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v7, 0x3f317217, v2 :: v_dual_mul_f32 v8, 0x3f317217, v3 @@ -1835,32 +1833,32 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX1100-GISEL-LABEL: s_log_v4f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s6 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s7 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 +; 
GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s8 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s9 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s10 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s11 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_mul_f32 v6, 0x3f317217, v1 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v7, 0x3f317217, v2 :: v_dual_mul_f32 v8, 0x3f317217, v3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index 3f060de9f6596..0a1f7ab6fc0ae 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -14,17 +14,17 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-LABEL: s_log10_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SI-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a +; SI-SDAG-NEXT: 
s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s0, -v1 @@ -42,15 +42,15 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_log10_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3284fbcf ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -70,15 +70,15 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; VI-SDAG-LABEL: s_log10_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-SDAG-NEXT: 
v_mul_f32_e32 v0, s2, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s0, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1 @@ -94,7 +94,6 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 @@ -102,15 +101,15 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_log10_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1 @@ -126,7 +125,6 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x411a209b ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -134,17 +132,17 @@ define 
amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_log10_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a +; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3284fbcf ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3284fbcf +; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v0 ; GFX900-SDAG-NEXT: v_fma_f32 v3, v0, s0, -v2 @@ -156,20 +154,20 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x411a209b ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[2:3] +; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[4:5] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log10_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX900-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3284fbcf ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -183,19 +181,18 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log10_f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x2c ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff @@ -207,8 +204,9 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-SDAG-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; 
GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s4 ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -216,14 +214,13 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; GFX1100-GISEL-LABEL: s_log10_f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff @@ -233,10 +230,11 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, s4 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 
v1, 0 ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -318,7 +316,7 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_log10_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s8, 0x3284fbcf @@ -359,7 +357,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-GISEL-LABEL: s_log10_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a @@ -398,7 +396,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-SDAG-LABEL: s_log10_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 @@ -445,7 +443,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-GISEL-LABEL: s_log10_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -492,7 +490,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; 
GFX900-SDAG-LABEL: s_log10_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3e9a209a @@ -530,7 +528,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-GISEL-LABEL: s_log10_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a @@ -568,7 +566,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX1100-SDAG-LABEL: s_log10_v2f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 @@ -603,7 +601,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX1100-GISEL-LABEL: s_log10_v2f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3 @@ -749,8 +747,8 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-LABEL: s_log10_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; 
SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -802,8 +800,8 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; SI-GISEL-LABEL: s_log10_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a @@ -855,7 +853,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-SDAG-LABEL: s_log10_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s8, 0x7f800000 @@ -864,7 +862,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc ; VI-SDAG-NEXT: v_mul_f32_e32 v2, s6, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v2 ; VI-SDAG-NEXT: v_sub_f32_e32 v4, v2, v3 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v3 @@ -921,8 +919,8 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-GISEL-LABEL: s_log10_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 
0x4f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -986,8 +984,8 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-SDAG-LABEL: s_log10_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1037,8 +1035,8 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-GISEL-LABEL: s_log10_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a @@ -1089,19 +1087,19 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-SDAG-LABEL: s_log10_v3f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s2 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s8 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 @@ -1122,7 +1120,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v7, 0x3284fbcf, v1 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s9 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -1145,19 +1143,19 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-GISEL-LABEL: s_log10_v3f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 
0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s7 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 @@ -1178,7 +1176,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v7, 0x3284fbcf, v1 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s9 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_add_f32_e32 v4, v4, 
v7 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -1355,8 +1353,8 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-LABEL: s_log10_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s12, 0x3284fbcf @@ -1419,8 +1417,8 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; SI-GISEL-LABEL: s_log10_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3e9a209a @@ -1483,8 +1481,8 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-SDAG-LABEL: s_log10_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1565,8 +1563,8 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-GISEL-LABEL: s_log10_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 
s[8:9], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1647,8 +1645,8 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX900-SDAG-LABEL: s_log10_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s10, 0x3284fbcf @@ -1710,8 +1708,8 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX900-GISEL-LABEL: s_log10_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3e9a209a @@ -1774,32 +1772,32 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX1100-SDAG-LABEL: s_log10_v4f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s7 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s7 +; GFX1100-SDAG-NEXT: 
v_cmp_gt_f32_e64 s9, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s8 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s7, v0 :: v_dual_mul_f32 v1, s6, v1 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s5, v2 :: v_dual_mul_f32 v3, s4, v3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s9 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s8 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s10 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s11 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 
0x3e9a209a, v0 :: v_dual_mul_f32 v6, 0x3e9a209a, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v7, 0x3e9a209a, v2 :: v_dual_mul_f32 v8, 0x3e9a209a, v3 @@ -1835,32 +1833,32 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX1100-GISEL-LABEL: s_log10_v4f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s6 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s7 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s8 ; GFX1100-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s9 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s10 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s11 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_mul_f32 v6, 0x3e9a209a, v1 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v7, 0x3e9a209a, v2 :: v_dual_mul_f32 v8, 0x3e9a209a, v3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index 035b2439eff15..7ca04cc235605 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -14,17 +14,17 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-LABEL: s_log2_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, 
s4, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 @@ -33,35 +33,35 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_log2_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_log2_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; VI-SDAG-NEXT: 
v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -71,14 +71,14 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_log2_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -90,8 +90,8 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_log2_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -103,43 +103,44 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 -; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[2:3] +; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; 
GFX900-GISEL-LABEL: s_log2_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX900-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log2_f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s3 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s4 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s0 +; GFX1100-SDAG-NEXT: s_load_b64 
s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -147,19 +148,19 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; GFX1100-GISEL-LABEL: s_log2_f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3 -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s4 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v1 :: v_dual_mov_b32 v1, 0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -215,7 +216,7 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) 
%out, <2 x float> %in) { ; SI-SDAG-LABEL: s_log2_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -241,7 +242,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-GISEL-LABEL: s_log2_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000 @@ -265,7 +266,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-SDAG-LABEL: s_log2_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 @@ -289,7 +290,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-GISEL-LABEL: s_log2_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000 @@ -313,7 +314,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_log2_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 @@ -336,7 +337,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) 
%out, <2 x float> %in) ; ; GFX900-GISEL-LABEL: s_log2_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000 @@ -359,7 +360,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX1100-SDAG-LABEL: s_log2_v2f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 @@ -384,7 +385,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX1100-GISEL-LABEL: s_log2_v2f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3 @@ -472,8 +473,8 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-LABEL: s_log2_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -504,8 +505,8 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; SI-GISEL-LABEL: s_log2_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-GISEL-NEXT: 
s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 @@ -536,8 +537,8 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-SDAG-LABEL: s_log2_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -567,11 +568,11 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-GISEL-LABEL: s_log2_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc @@ -598,8 +599,8 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-SDAG-LABEL: s_log2_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -623,16 +624,16 @@ define amdgpu_kernel 
void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v6, v5 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0 -; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[2:3] +; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log2_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc @@ -658,32 +659,32 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX1100-SDAG-LABEL: s_log2_v3f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s5 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s2 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, s3 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, s1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s2 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, s6, v2 -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v4, s5, v4 :: v_dual_mul_f32 v5, s4, v5 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s1 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v5, s4, v5 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v4, s5, v4 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s7 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v4 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v5 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_3) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v4 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v6, 0 +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v2, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_dual_sub_f32 v2, v2, v0 :: v_dual_sub_f32 v1, v4, v1 -; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v5, v3 +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v0, v5, v3 :: v_dual_sub_f32 v1, v4, v1 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b96 v6, v[0:2], s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -692,21 +693,21 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-GISEL-LABEL: s_log2_v3f32: 
; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s7 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 @@ -813,45 +814,45 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) define 
amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-LABEL: s_log2_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 1.0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4 -; SI-SDAG-NEXT: v_mul_f32_e32 v6, s2, v6 -; SI-SDAG-NEXT: v_mul_f32_e32 v8, s1, v8 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v4, s7, v4 +; SI-SDAG-NEXT: v_mul_f32_e32 v6, s6, v6 +; SI-SDAG-NEXT: v_mul_f32_e32 v8, s5, v8 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v4, v4 ; SI-SDAG-NEXT: v_log_f32_e32 v6, v6 ; SI-SDAG-NEXT: v_log_f32_e32 v8, v8 ; SI-SDAG-NEXT: v_log_f32_e32 v9, v1 -; SI-SDAG-NEXT: s_mov_b32 s6, -1 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_sub_f32_e32 v3, v4, v2 ; SI-SDAG-NEXT: v_sub_f32_e32 v2, v6, v5 ; SI-SDAG-NEXT: 
v_sub_f32_e32 v1, v8, v7 ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v9, v0 -; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_log2_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 @@ -887,8 +888,8 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-SDAG-LABEL: s_log2_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -924,8 +925,8 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-GISEL-LABEL: s_log2_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 @@ -961,8 +962,8 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX900-SDAG-LABEL: s_log2_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -992,13 +993,13 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v7, v6 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v9, v8 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v10, v0 -; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log2_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 @@ -1028,42 +1029,41 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v5, v2 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log2_v4f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s7 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s7 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s6 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 ; GFX1100-SDAG-NEXT: 
v_cmp_gt_f32_e64 s9, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s2 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 1.0, 0x4f800000, s8 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v7, 1.0, 0x4f800000, s9 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s1 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s7, v2 :: v_dual_mul_f32 v3, s6, v3 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v6, s5, v6 :: v_dual_mul_f32 v7, s4, v7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v8, v3 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(TRANS32_DEP_3) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v6, v6 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v7, v7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v9, 0 ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v3, v2, v0 :: v_dual_sub_f32 v2, v8, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v6, v4 :: 
v_dual_sub_f32 v0, v7, v5 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b128 v9, v[0:3], s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1072,32 +1072,32 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX1100-GISEL-LABEL: s_log2_v4f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s6 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s7 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8 ; GFX1100-GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x42000000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 0x42000000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x42000000, s10 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 0x42000000, s11 ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v2, v2, v6 :: v_dual_sub_f32 v3, v3, v7 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index fa7ee9e8d28ff..5d3a5800bcdd8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -425,8 +425,8 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) { ; GFX7-LABEL: s_maximum_f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, s4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -442,10 +442,10 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) { ; GFX8-LABEL: s_maximum_f16: ; GFX8: ; 
%bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_max_f16_e32 v1, s4, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_max_f16_e32 v1, s6, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s4, v0 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s6, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: ;;#ASMSTART @@ -456,10 +456,10 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) { ; GFX9-LABEL: s_maximum_f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: v_max_f16_e32 v1, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_max_f16_e32 v1, s6, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s4, v0 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s6, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: ;;#ASMSTART @@ -485,8 +485,8 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) { ; GFX10-LABEL: s_maximum_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f16_e64 v0, s4, s5 -; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s4, s5 +; GFX10-NEXT: v_max_f16_e64 v0, s6, s7 +; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s6, s7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: ;;#ASMSTART @@ -870,10 +870,10 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX7-LABEL: s_maximum_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, s5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, s6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, s4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s17 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, s7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, s16 +; GFX7-NEXT: 
v_cvt_f16_f32_e32 v3, s6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -897,16 +897,16 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX8-LABEL: s_maximum_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s6, s5, 16 -; GFX8-NEXT: s_lshr_b32 s7, s4, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_max_f16_e32 v1, s7, v0 +; GFX8-NEXT: s_lshr_b32 s4, s7, 16 +; GFX8-NEXT: s_lshr_b32 s5, s6, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_max_f16_e32 v1, s5, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s7, v0 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s5, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_max_f16_e32 v3, s4, v1 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s4, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_max_f16_e32 v3, s6, v1 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s6, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -918,17 +918,17 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX9-LABEL: s_maximum_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_lshr_b32 s5, s5, 16 -; GFX9-NEXT: v_pk_max_f16 v1, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_lshr_b32 s4, s7, 16 +; GFX9-NEXT: v_pk_max_f16 v1, s6, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s4, v0 -; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s6, v0 +; GFX9-NEXT: s_lshr_b32 s5, s6, 16 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: 
v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s4, v3 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 @@ -963,13 +963,13 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX10-LABEL: s_maximum_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s4, s5 -; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s4, s5 -; GFX10-NEXT: s_lshr_b32 s6, s5, 16 -; GFX10-NEXT: s_lshr_b32 s4, s4, 16 +; GFX10-NEXT: v_pk_max_f16 v0, s6, s7 +; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s6, s7 +; GFX10-NEXT: s_lshr_b32 s4, s7, 16 +; GFX10-NEXT: s_lshr_b32 s5, s6, 16 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s4, s6 +; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s5, s4 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll index f4aa40dbd9bcd..e6655aeab7e9b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll @@ -401,10 +401,10 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) { ; GFX7-LABEL: s_maximum_f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: v_max_f32_e32 v1, s4, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_max_f32_e32 v1, s6, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use v0 @@ -414,10 +414,10 @@ define void 
@s_maximum_f32(float inreg %src0, float inreg %src1) { ; GFX8-LABEL: s_maximum_f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_max_f32_e32 v1, s4, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_max_f32_e32 v1, s6, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v0 @@ -427,10 +427,10 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) { ; GFX9-LABEL: s_maximum_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: v_max_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_max_f32_e32 v1, s6, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 @@ -454,8 +454,8 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) { ; GFX10-LABEL: s_maximum_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f32_e64 v0, s4, s5 -; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s4, s5 +; GFX10-NEXT: v_max_f32_e64 v0, s6, s7 +; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s6, s7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v0 @@ -781,14 +781,14 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX7-LABEL: s_maximum_v2f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_max_f32_e32 v1, s5, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s17 +; GFX7-NEXT: v_max_f32_e32 v1, s7, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s5, v0 -; GFX7-NEXT: 
v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s7, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s16 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX7-NEXT: v_max_f32_e32 v3, s4, v0 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 +; GFX7-NEXT: v_max_f32_e32 v3, s6, v0 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use v[0:1] @@ -798,14 +798,14 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX8-LABEL: s_maximum_v2f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_max_f32_e32 v1, s5, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s17 +; GFX8-NEXT: v_max_f32_e32 v1, s7, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s5, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s7, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: v_max_f32_e32 v3, s4, v0 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 +; GFX8-NEXT: v_max_f32_e32 v3, s6, v0 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v[0:1] @@ -815,14 +815,14 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX9-LABEL: s_maximum_v2f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_max_f32_e32 v1, s5, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s17 +; GFX9-NEXT: v_max_f32_e32 v1, s7, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s5, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s7, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_max_f32_e32 v3, s4, v0 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 +; GFX9-NEXT: v_max_f32_e32 v3, s6, v0 +; GFX9-NEXT: 
v_cmp_o_f32_e32 vcc, s6, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v[0:1] @@ -850,11 +850,11 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX10-LABEL: s_maximum_v2f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f32_e64 v0, s5, s7 -; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s5, s7 -; GFX10-NEXT: v_max_f32_e64 v2, s4, s6 +; GFX10-NEXT: v_max_f32_e64 v0, s7, s17 +; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s7, s17 +; GFX10-NEXT: v_max_f32_e64 v2, s6, s16 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v0, vcc_lo -; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s4, s6 +; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s6, s16 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll index e9acbec33f2f3..9a83c04cad1e3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll @@ -427,10 +427,10 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) { ; GFX7-LABEL: s_maximum_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_max_f64 v[2:3], s[4:5], v[0:1] -; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[4:5], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s16 +; GFX7-NEXT: v_mov_b32_e32 v1, s17 +; GFX7-NEXT: v_max_f64 v[2:3], s[6:7], v[0:1] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc @@ -442,10 +442,10 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) { ; GFX8-LABEL: s_maximum_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; 
GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_max_f64 v[2:3], s[4:5], v[0:1] -; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[4:5], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], v[0:1] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc @@ -457,10 +457,10 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) { ; GFX9-LABEL: s_maximum_f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_max_f64 v[2:3], s[4:5], v[0:1] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], v[0:1] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc @@ -487,8 +487,8 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) { ; GFX10-LABEL: s_maximum_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f64 v[0:1], s[4:5], s[6:7] -; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[4:5], s[6:7] +; GFX10-NEXT: v_max_f64 v[0:1], s[6:7], s[16:17] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[6:7], s[16:17] ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s4 ; GFX10-NEXT: ;;#ASMSTART @@ -844,14 +844,14 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX7-LABEL: s_maximum_v2f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-NEXT: v_mov_b32_e32 v1, s11 -; GFX7-NEXT: v_mov_b32_e32 v5, s9 -; GFX7-NEXT: v_max_f64 v[2:3], s[6:7], 
v[0:1] -; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] -; GFX7-NEXT: v_max_f64 v[0:1], s[4:5], v[4:5] -; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], s[4:5], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, s20 +; GFX7-NEXT: v_mov_b32_e32 v4, s18 +; GFX7-NEXT: v_mov_b32_e32 v1, s21 +; GFX7-NEXT: v_mov_b32_e32 v5, s19 +; GFX7-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] +; GFX7-NEXT: v_max_f64 v[0:1], s[6:7], v[4:5] +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[4:5] ; GFX7-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -865,14 +865,14 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX8-LABEL: s_maximum_v2f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, s11 -; GFX8-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], v[0:1] -; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] -; GFX8-NEXT: v_max_f64 v[0:1], s[4:5], v[4:5] -; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], s[4:5], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: v_mov_b32_e32 v4, s18 +; GFX8-NEXT: v_mov_b32_e32 v1, s21 +; GFX8-NEXT: v_mov_b32_e32 v5, s19 +; GFX8-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], s[6:7], v[4:5] +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -886,14 +886,14 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX9-LABEL: s_maximum_v2f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: v_mov_b32_e32 
v5, s9 -; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], v[0:1] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] -; GFX9-NEXT: v_max_f64 v[0:1], s[4:5], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], s[4:5], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] +; GFX9-NEXT: v_max_f64 v[0:1], s[6:7], v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -907,11 +907,11 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX940-LABEL: s_maximum_v2f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[16:17] ; GFX940-NEXT: v_max_f64 v[2:3], s[2:3], v[0:1] ; GFX940-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX940-NEXT: v_cmp_u_f64_e32 vcc, s[2:3], v[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_max_f64 v[4:5], s[0:1], v[0:1] ; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -927,14 +927,14 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX10-LABEL: s_maximum_v2f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f64 v[0:1], s[6:7], s[10:11] -; GFX10-NEXT: v_cmp_u_f64_e64 s6, s[6:7], s[10:11] -; GFX10-NEXT: v_max_f64 v[4:5], s[4:5], s[8:9] -; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[4:5], s[8:9] -; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, 0, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, 0x7ff80000, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, 0, s4 +; GFX10-NEXT: 
v_max_f64 v[0:1], s[16:17], s[20:21] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[16:17], s[20:21] +; GFX10-NEXT: v_max_f64 v[4:5], s[6:7], s[18:19] +; GFX10-NEXT: v_cmp_u_f64_e64 s5, s[6:7], s[18:19] +; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, 0x7ff80000, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, 0, s5 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v[0:3] ; GFX10-NEXT: ;;#ASMEND @@ -943,10 +943,10 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX11-LABEL: s_maximum_v2f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[6:7] -; GFX11-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], s[0:1], s[4:5] -; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[16:17] +; GFX11-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[16:17] +; GFX11-NEXT: v_max_f64 v[4:5], s[0:1], s[6:7] +; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v0, 0, s2 @@ -964,8 +964,8 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_maximum_f64 v[2:3], s[2:3], s[6:7] -; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[4:5] +; GFX12-NEXT: v_maximum_f64 v[2:3], s[2:3], s[16:17] +; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[6:7] ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use v[0:3] ; GFX12-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll index d056a97dc5444..c7913f638798a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll 
@@ -13,111 +13,111 @@ declare <4 x half> @llvm.maxnum.v4f16(<4 x half> %a, <4 x half> %b) define amdgpu_kernel void @maxnum_f16( ; SI-LABEL: maxnum_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: maxnum_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 
s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_max_f16_e32 v1, v1, v1 ; VI-NEXT: v_max_f16_e32 v0, v0, v1 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: maxnum_f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s14, s10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s15, s3 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_mov_b32 s2, s10 +; GFX9-NEXT: s_mov_b32 s3, s11 ; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s5 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_short 
v0, off, s[8:11], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s14, s2 -; GFX10-NEXT: s_mov_b32 s15, s3 -; GFX10-NEXT: s_mov_b32 s10, s2 -; GFX10-NEXT: s_mov_b32 s11, s3 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_mov_b32 s10, -1 +; GFX10-NEXT: s_mov_b32 s11, 0x31016000 +; GFX10-NEXT: s_mov_b32 s14, s10 +; GFX10-NEXT: s_mov_b32 s15, s11 +; GFX10-NEXT: s_mov_b32 s2, s10 +; GFX10-NEXT: s_mov_b32 s3, s11 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s12, s6 ; GFX10-NEXT: s_mov_b32 s13, s7 ; GFX10-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc +; GFX10-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s4 -; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: s_mov_b32 s8, s4 +; GFX10-NEXT: s_mov_b32 s9, s5 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX10-NEXT: buffer_store_short v0, off, s[8:11], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: maxnum_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -155,7 +155,7 @@ entry: define amdgpu_kernel void @maxnum_f16_imm_a( ; SI-LABEL: maxnum_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -175,7 +175,7 @@ define amdgpu_kernel void @maxnum_f16_imm_a( ; ; VI-LABEL: maxnum_f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -194,7 +194,7 @@ define amdgpu_kernel void @maxnum_f16_imm_a( ; ; GFX9-LABEL: maxnum_f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -213,7 +213,7 @@ define amdgpu_kernel void @maxnum_f16_imm_a( ; ; GFX10-LABEL: maxnum_f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -232,7 +232,7 @@ define amdgpu_kernel void @maxnum_f16_imm_a( ; ; GFX11-LABEL: maxnum_f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -263,7 +263,7 @@ entry: define amdgpu_kernel void @maxnum_f16_imm_b( ; SI-LABEL: maxnum_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -283,7 +283,7 @@ define amdgpu_kernel void @maxnum_f16_imm_b( ; ; VI-LABEL: maxnum_f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; 
VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -302,7 +302,7 @@ define amdgpu_kernel void @maxnum_f16_imm_b( ; ; GFX9-LABEL: maxnum_f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -321,7 +321,7 @@ define amdgpu_kernel void @maxnum_f16_imm_b( ; ; GFX10-LABEL: maxnum_f16_imm_b: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -340,7 +340,7 @@ define amdgpu_kernel void @maxnum_f16_imm_b( ; ; GFX11-LABEL: maxnum_f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -371,8 +371,8 @@ entry: define amdgpu_kernel void @maxnum_v2f16( ; SI-LABEL: maxnum_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[6:7], 0x0 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -396,8 +396,8 @@ define amdgpu_kernel void @maxnum_v2f16( ; ; VI-LABEL: maxnum_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -420,18 +420,18 @@ define amdgpu_kernel void @maxnum_v2f16( ; ; GFX9-LABEL: maxnum_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 
s[8:9], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 -; GFX9-NEXT: s_load_dword s11, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s9, s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 -; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 +; GFX9-NEXT: v_pk_max_f16 v0, s8, s8 +; GFX9-NEXT: v_pk_max_f16 v1, s9, s9 ; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -439,16 +439,16 @@ define amdgpu_kernel void @maxnum_v2f16( ; GFX10-LABEL: maxnum_v2f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 -; GFX10-NEXT: v_pk_max_f16 v1, s1, s1 +; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX10-NEXT: v_pk_max_f16 v1, s3, s3 ; GFX10-NEXT: v_pk_max_f16 v0, v1, v0 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm @@ -456,8 +456,8 @@ define amdgpu_kernel void @maxnum_v2f16( ; GFX11-LABEL: maxnum_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: 
s_load_b64 s[4:5], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -486,7 +486,7 @@ entry: define amdgpu_kernel void @maxnum_v2f16_imm_a( ; SI-LABEL: maxnum_v2f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -506,7 +506,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a( ; ; VI-LABEL: maxnum_v2f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 @@ -524,7 +524,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a( ; ; GFX9-LABEL: maxnum_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -538,7 +538,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a( ; ; GFX10-LABEL: maxnum_v2f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 @@ -551,7 +551,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a( ; ; GFX11-LABEL: maxnum_v2f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -576,7 +576,7 @@ entry: define amdgpu_kernel void @maxnum_v2f16_imm_b( ; SI-LABEL: maxnum_v2f16_imm_b: ; SI: ; 
%bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -596,7 +596,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b( ; ; VI-LABEL: maxnum_v2f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4200 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 @@ -614,7 +614,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b( ; ; GFX9-LABEL: maxnum_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -628,7 +628,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b( ; ; GFX10-LABEL: maxnum_v2f16_imm_b: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 @@ -641,7 +641,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b( ; ; GFX11-LABEL: maxnum_v2f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -667,8 +667,8 @@ entry: define amdgpu_kernel void @maxnum_v3f16( ; SI-LABEL: maxnum_v3f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x0 @@ -697,8 +697,8 @@ define amdgpu_kernel void @maxnum_v3f16( ; ; VI-LABEL: maxnum_v3f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -725,21 +725,21 @@ define amdgpu_kernel void @maxnum_v3f16( ; ; GFX9-LABEL: maxnum_v3f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 -; GFX9-NEXT: v_pk_max_f16 v1, s12, s12 -; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 +; GFX9-NEXT: v_pk_max_f16 v0, s8, s8 +; GFX9-NEXT: v_pk_max_f16 v1, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v2, s9, s9 ; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 +; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -748,17 +748,17 @@ define amdgpu_kernel void @maxnum_v3f16( ; GFX10-LABEL: maxnum_v3f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v1, s1, s1 +; GFX10-NEXT: v_pk_max_f16 v1, s3, s3 ; GFX10-NEXT: v_pk_max_f16 v2, s9, s9 -; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 +; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 ; GFX10-NEXT: v_pk_max_f16 v1, v2, v1 ; GFX10-NEXT: v_pk_max_f16 v0, v3, v0 @@ -769,8 +769,8 @@ define amdgpu_kernel void @maxnum_v3f16( ; GFX11-LABEL: maxnum_v3f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -804,28 +804,28 @@ entry: define amdgpu_kernel void @maxnum_v4f16( ; SI-LABEL: maxnum_v4f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; SI-NEXT: s_load_dwordx2 s[2:3], s[10:11], 0x0 +; SI-NEXT: s_mov_b32 s4, s8 +; SI-NEXT: s_mov_b32 s5, s9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: s_lshr_b32 s6, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 -; SI-NEXT: s_lshr_b32 s6, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, 
s6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s6, s5, 16 -; SI-NEXT: s_lshr_b32 s4, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 +; SI-NEXT: s_lshr_b32 s2, s3, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s0 +; SI-NEXT: s_lshr_b32 s2, s1, 16 +; SI-NEXT: s_lshr_b32 s0, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s1 ; SI-NEXT: v_max_f32_e32 v3, v3, v5 ; SI-NEXT: v_max_f32_e32 v2, v2, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -838,13 +838,13 @@ define amdgpu_kernel void @maxnum_v4f16( ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: maxnum_v4f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -876,21 +876,21 @@ define amdgpu_kernel void @maxnum_v4f16( ; ; GFX9-LABEL: maxnum_v4f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[8:9], 
s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 -; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 -; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v0, s9, s9 +; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 +; GFX9-NEXT: v_pk_max_f16 v2, s8, s8 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v0, s12, s12 +; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -898,17 +898,17 @@ define amdgpu_kernel void @maxnum_v4f16( ; GFX10-LABEL: maxnum_v4f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s1, s1 +; GFX10-NEXT: v_pk_max_f16 v0, s3, s3 ; GFX10-NEXT: v_pk_max_f16 v1, s9, s9 -; GFX10-NEXT: v_pk_max_f16 v2, s0, s0 +; GFX10-NEXT: v_pk_max_f16 v2, s2, s2 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 ; GFX10-NEXT: v_pk_max_f16 v1, v1, v0 ; GFX10-NEXT: v_pk_max_f16 v0, v3, v2 @@ -918,8 +918,8 @@ define amdgpu_kernel void @maxnum_v4f16( ; GFX11-LABEL: maxnum_v4f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; 
GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -951,7 +951,7 @@ entry: define amdgpu_kernel void @fmax_v4f16_imm_a( ; SI-LABEL: fmax_v4f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -980,7 +980,7 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; ; VI-LABEL: fmax_v4f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x4400 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -1007,7 +1007,7 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; ; GFX9-LABEL: fmax_v4f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s8, 0x44004200 ; GFX9-NEXT: s_mov_b32 s9, 0x40004800 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 @@ -1026,7 +1026,7 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; ; GFX10-LABEL: fmax_v4f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1041,7 +1041,7 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; ; GFX11-LABEL: fmax_v4f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index e00ebff751c73..01effc24e741d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -351,10 +351,10 @@ define void 
@s_minimum_f16(half inreg %src0, half inreg %src1) { ; GFX8-LABEL: s_minimum_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_min_f16_e32 v1, s4, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_min_f16_e32 v1, s6, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s4, v0 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s6, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: ;;#ASMSTART @@ -365,10 +365,10 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) { ; GFX9-LABEL: s_minimum_f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: v_min_f16_e32 v1, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_min_f16_e32 v1, s6, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s4, v0 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s6, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: ;;#ASMSTART @@ -394,8 +394,8 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) { ; GFX10-LABEL: s_minimum_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_min_f16_e64 v0, s4, s5 -; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s4, s5 +; GFX10-NEXT: v_min_f16_e64 v0, s6, s7 +; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s6, s7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: ;;#ASMSTART @@ -709,16 +709,16 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX8-LABEL: s_minimum_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s6, s5, 16 -; GFX8-NEXT: s_lshr_b32 s7, s4, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_min_f16_e32 v1, s7, v0 +; GFX8-NEXT: s_lshr_b32 s4, s7, 16 +; GFX8-NEXT: s_lshr_b32 
s5, s6, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_min_f16_e32 v1, s5, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s7, v0 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s5, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_min_f16_e32 v3, s4, v1 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s4, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_min_f16_e32 v3, s6, v1 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s6, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -730,17 +730,17 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX9-LABEL: s_minimum_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_lshr_b32 s5, s5, 16 -; GFX9-NEXT: v_pk_min_f16 v1, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_lshr_b32 s4, s7, 16 +; GFX9-NEXT: v_pk_min_f16 v1, s6, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s4, v0 -; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s6, v0 +; GFX9-NEXT: s_lshr_b32 s5, s6, 16 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s4, v3 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 @@ -775,13 +775,13 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX10-LABEL: s_minimum_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_min_f16 v0, s4, s5 -; GFX10-NEXT: 
v_cmp_o_f16_e64 vcc_lo, s4, s5 -; GFX10-NEXT: s_lshr_b32 s6, s5, 16 -; GFX10-NEXT: s_lshr_b32 s4, s4, 16 +; GFX10-NEXT: v_pk_min_f16 v0, s6, s7 +; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s6, s7 +; GFX10-NEXT: s_lshr_b32 s4, s7, 16 +; GFX10-NEXT: s_lshr_b32 s5, s6, 16 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s4, s6 +; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s5, s4 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll index e056682051aa4..518fc27c23082 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll @@ -401,10 +401,10 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) { ; GFX7-LABEL: s_minimum_f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: v_min_f32_e32 v1, s4, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_min_f32_e32 v1, s6, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use v0 @@ -414,10 +414,10 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) { ; GFX8-LABEL: s_minimum_f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_min_f32_e32 v1, s4, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_min_f32_e32 v1, s6, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v0 @@ -427,10 +427,10 @@ define void 
@s_minimum_f32(float inreg %src0, float inreg %src1) { ; GFX9-LABEL: s_minimum_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: v_min_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_min_f32_e32 v1, s6, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 @@ -454,8 +454,8 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) { ; GFX10-LABEL: s_minimum_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_min_f32_e64 v0, s4, s5 -; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s4, s5 +; GFX10-NEXT: v_min_f32_e64 v0, s6, s7 +; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s6, s7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v0 @@ -781,14 +781,14 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX7-LABEL: s_minimum_v2f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_min_f32_e32 v1, s5, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s17 +; GFX7-NEXT: v_min_f32_e32 v1, s7, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s5, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s7, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s16 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX7-NEXT: v_min_f32_e32 v3, s4, v0 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 +; GFX7-NEXT: v_min_f32_e32 v3, s6, v0 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use v[0:1] @@ -798,14 +798,14 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX8-LABEL: s_minimum_v2f32: ; GFX8: ; %bb.0: 
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_min_f32_e32 v1, s5, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s17 +; GFX8-NEXT: v_min_f32_e32 v1, s7, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s5, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s7, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: v_min_f32_e32 v3, s4, v0 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 +; GFX8-NEXT: v_min_f32_e32 v3, s6, v0 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v[0:1] @@ -815,14 +815,14 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX9-LABEL: s_minimum_v2f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_min_f32_e32 v1, s5, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s17 +; GFX9-NEXT: v_min_f32_e32 v1, s7, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s5, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s7, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_min_f32_e32 v3, s4, v0 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 +; GFX9-NEXT: v_min_f32_e32 v3, s6, v0 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v[0:1] @@ -850,11 +850,11 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX10-LABEL: s_minimum_v2f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_min_f32_e64 v0, s5, s7 -; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s5, s7 -; GFX10-NEXT: v_min_f32_e64 v2, s4, s6 +; GFX10-NEXT: v_min_f32_e64 v0, s7, s17 +; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s7, s17 +; GFX10-NEXT: 
v_min_f32_e64 v2, s6, s16 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v0, vcc_lo -; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s4, s6 +; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s6, s16 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll index d8462ec220244..81b892d424b46 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll @@ -427,10 +427,10 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) { ; GFX7-LABEL: s_minimum_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_min_f64 v[2:3], s[4:5], v[0:1] -; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[4:5], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s16 +; GFX7-NEXT: v_mov_b32_e32 v1, s17 +; GFX7-NEXT: v_min_f64 v[2:3], s[6:7], v[0:1] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc @@ -442,10 +442,10 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) { ; GFX8-LABEL: s_minimum_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_min_f64 v[2:3], s[4:5], v[0:1] -; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[4:5], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_min_f64 v[2:3], s[6:7], v[0:1] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc @@ -457,10 +457,10 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) { ; GFX9-LABEL: s_minimum_f64: ; GFX9: ; %bb.0: ; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_min_f64 v[2:3], s[4:5], v[0:1] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_min_f64 v[2:3], s[6:7], v[0:1] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc @@ -487,8 +487,8 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) { ; GFX10-LABEL: s_minimum_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_min_f64 v[0:1], s[4:5], s[6:7] -; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[4:5], s[6:7] +; GFX10-NEXT: v_min_f64 v[0:1], s[6:7], s[16:17] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[6:7], s[16:17] ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s4 ; GFX10-NEXT: ;;#ASMSTART @@ -844,14 +844,14 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX7-LABEL: s_minimum_v2f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-NEXT: v_mov_b32_e32 v1, s11 -; GFX7-NEXT: v_mov_b32_e32 v5, s9 -; GFX7-NEXT: v_min_f64 v[2:3], s[6:7], v[0:1] -; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] -; GFX7-NEXT: v_min_f64 v[0:1], s[4:5], v[4:5] -; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], s[4:5], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, s20 +; GFX7-NEXT: v_mov_b32_e32 v4, s18 +; GFX7-NEXT: v_mov_b32_e32 v1, s21 +; GFX7-NEXT: v_mov_b32_e32 v5, s19 +; GFX7-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] +; GFX7-NEXT: v_min_f64 v[0:1], s[6:7], v[4:5] +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[4:5] ; GFX7-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX7-NEXT: 
v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -865,14 +865,14 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX8-LABEL: s_minimum_v2f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, s11 -; GFX8-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NEXT: v_min_f64 v[2:3], s[6:7], v[0:1] -; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] -; GFX8-NEXT: v_min_f64 v[0:1], s[4:5], v[4:5] -; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], s[4:5], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: v_mov_b32_e32 v4, s18 +; GFX8-NEXT: v_mov_b32_e32 v1, s21 +; GFX8-NEXT: v_mov_b32_e32 v5, s19 +; GFX8-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] +; GFX8-NEXT: v_min_f64 v[0:1], s[6:7], v[4:5] +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -886,14 +886,14 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX9-LABEL: s_minimum_v2f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: v_mov_b32_e32 v5, s9 -; GFX9-NEXT: v_min_f64 v[2:3], s[6:7], v[0:1] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] -; GFX9-NEXT: v_min_f64 v[0:1], s[4:5], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], s[4:5], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] +; GFX9-NEXT: v_min_f64 v[0:1], s[6:7], v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[4:5] ; 
GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -907,11 +907,11 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX940-LABEL: s_minimum_v2f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[16:17] ; GFX940-NEXT: v_min_f64 v[2:3], s[2:3], v[0:1] ; GFX940-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX940-NEXT: v_cmp_u_f64_e32 vcc, s[2:3], v[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_min_f64 v[4:5], s[0:1], v[0:1] ; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -927,14 +927,14 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX10-LABEL: s_minimum_v2f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_min_f64 v[0:1], s[6:7], s[10:11] -; GFX10-NEXT: v_cmp_u_f64_e64 s6, s[6:7], s[10:11] -; GFX10-NEXT: v_min_f64 v[4:5], s[4:5], s[8:9] -; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[4:5], s[8:9] -; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, 0, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, 0x7ff80000, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, 0, s4 +; GFX10-NEXT: v_min_f64 v[0:1], s[16:17], s[20:21] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[16:17], s[20:21] +; GFX10-NEXT: v_min_f64 v[4:5], s[6:7], s[18:19] +; GFX10-NEXT: v_cmp_u_f64_e64 s5, s[6:7], s[18:19] +; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, 0x7ff80000, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, 0, s5 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v[0:3] ; GFX10-NEXT: ;;#ASMEND @@ -943,10 +943,10 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x 
double> inreg %src1) ; GFX11-LABEL: s_minimum_v2f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_min_f64 v[0:1], s[2:3], s[6:7] -; GFX11-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], s[0:1], s[4:5] -; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[4:5] +; GFX11-NEXT: v_min_f64 v[0:1], s[2:3], s[16:17] +; GFX11-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[16:17] +; GFX11-NEXT: v_min_f64 v[4:5], s[0:1], s[6:7] +; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v0, 0, s2 @@ -964,8 +964,8 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_minimum_f64 v[2:3], s[2:3], s[6:7] -; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[4:5] +; GFX12-NEXT: v_minimum_f64 v[2:3], s[2:3], s[16:17] +; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[6:7] ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use v[0:3] ; GFX12-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll index f934a2de9247f..0a004fd7701cf 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -13,111 +13,111 @@ declare <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b) define amdgpu_kernel void @minnum_f16_ieee( ; SI-LABEL: minnum_f16_ieee: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, 
s10 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: minnum_f16_ieee: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_max_f16_e32 v1, v1, v1 ; 
VI-NEXT: v_min_f16_e32 v0, v0, v1 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: minnum_f16_ieee: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s14, s10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s15, s3 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_mov_b32 s2, s10 +; GFX9-NEXT: s_mov_b32 s3, s11 ; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s5 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_f16_ieee: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s14, s2 -; GFX10-NEXT: s_mov_b32 s15, s3 -; GFX10-NEXT: s_mov_b32 s10, s2 -; GFX10-NEXT: s_mov_b32 s11, s3 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; 
GFX10-NEXT: s_mov_b32 s10, -1 +; GFX10-NEXT: s_mov_b32 s11, 0x31016000 +; GFX10-NEXT: s_mov_b32 s14, s10 +; GFX10-NEXT: s_mov_b32 s15, s11 +; GFX10-NEXT: s_mov_b32 s2, s10 +; GFX10-NEXT: s_mov_b32 s3, s11 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s12, s6 ; GFX10-NEXT: s_mov_b32 s13, s7 ; GFX10-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc +; GFX10-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s4 -; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: s_mov_b32 s8, s4 +; GFX10-NEXT: s_mov_b32 s9, s5 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX10-NEXT: buffer_store_short v0, off, s[8:11], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: minnum_f16_ieee: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -182,7 +182,7 @@ define amdgpu_ps half @minnum_f16_no_ieee(half %a, half %b) #0 { define amdgpu_kernel void @minnum_f16_imm_a( ; SI-LABEL: minnum_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -202,7 +202,7 @@ define amdgpu_kernel void @minnum_f16_imm_a( ; ; VI-LABEL: minnum_f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -221,7 
+221,7 @@ define amdgpu_kernel void @minnum_f16_imm_a( ; ; GFX9-LABEL: minnum_f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -240,7 +240,7 @@ define amdgpu_kernel void @minnum_f16_imm_a( ; ; GFX10-LABEL: minnum_f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -259,7 +259,7 @@ define amdgpu_kernel void @minnum_f16_imm_a( ; ; GFX11-LABEL: minnum_f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -289,7 +289,7 @@ entry: define amdgpu_kernel void @minnum_f16_imm_b( ; SI-LABEL: minnum_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -309,7 +309,7 @@ define amdgpu_kernel void @minnum_f16_imm_b( ; ; VI-LABEL: minnum_f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -328,7 +328,7 @@ define amdgpu_kernel void @minnum_f16_imm_b( ; ; GFX9-LABEL: minnum_f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -347,7 +347,7 @@ define amdgpu_kernel void @minnum_f16_imm_b( ; ; GFX10-LABEL: 
minnum_f16_imm_b: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -366,7 +366,7 @@ define amdgpu_kernel void @minnum_f16_imm_b( ; ; GFX11-LABEL: minnum_f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -396,8 +396,8 @@ entry: define amdgpu_kernel void @minnum_v2f16_ieee( ; SI-LABEL: minnum_v2f16_ieee: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[6:7], 0x0 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -421,8 +421,8 @@ define amdgpu_kernel void @minnum_v2f16_ieee( ; ; VI-LABEL: minnum_v2f16_ieee: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -445,18 +445,18 @@ define amdgpu_kernel void @minnum_v2f16_ieee( ; ; GFX9-LABEL: minnum_v2f16_ieee: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 -; GFX9-NEXT: s_load_dword s11, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s9, 
s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 -; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 +; GFX9-NEXT: v_pk_max_f16 v0, s8, s8 +; GFX9-NEXT: v_pk_max_f16 v1, s9, s9 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -464,16 +464,16 @@ define amdgpu_kernel void @minnum_v2f16_ieee( ; GFX10-LABEL: minnum_v2f16_ieee: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 -; GFX10-NEXT: v_pk_max_f16 v1, s1, s1 +; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX10-NEXT: v_pk_max_f16 v1, s3, s3 ; GFX10-NEXT: v_pk_min_f16 v0, v1, v0 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm @@ -481,8 +481,8 @@ define amdgpu_kernel void @minnum_v2f16_ieee( ; GFX11-LABEL: minnum_v2f16_ieee: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -545,7 +545,7 @@ define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b) define amdgpu_kernel void @minnum_v2f16_imm_a( ; SI-LABEL: minnum_v2f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -565,7 +565,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_a( ; ; VI-LABEL: minnum_v2f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 @@ -583,7 +583,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_a( ; ; GFX9-LABEL: minnum_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -597,7 +597,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_a( ; ; GFX10-LABEL: minnum_v2f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 @@ -610,7 +610,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_a( ; ; GFX11-LABEL: minnum_v2f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -634,7 +634,7 @@ entry: define amdgpu_kernel void @minnum_v2f16_imm_b( ; SI-LABEL: minnum_v2f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -654,7 +654,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_b( ; ; VI-LABEL: minnum_v2f16_imm_b: ; VI: ; %bb.0: ; %entry 
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4200 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 @@ -672,7 +672,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_b( ; ; GFX9-LABEL: minnum_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -686,7 +686,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_b( ; ; GFX10-LABEL: minnum_v2f16_imm_b: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 @@ -699,7 +699,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_b( ; ; GFX11-LABEL: minnum_v2f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -724,8 +724,8 @@ entry: define amdgpu_kernel void @minnum_v3f16( ; SI-LABEL: minnum_v3f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -754,8 +754,8 @@ define amdgpu_kernel void @minnum_v3f16( ; ; VI-LABEL: minnum_v3f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 
0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -782,21 +782,21 @@ define amdgpu_kernel void @minnum_v3f16( ; ; GFX9-LABEL: minnum_v3f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 -; GFX9-NEXT: v_pk_max_f16 v1, s12, s12 -; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 +; GFX9-NEXT: v_pk_max_f16 v0, s8, s8 +; GFX9-NEXT: v_pk_max_f16 v1, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v2, s9, s9 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 +; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v2 ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -805,17 +805,17 @@ define amdgpu_kernel void @minnum_v3f16( ; GFX10-LABEL: minnum_v3f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v1, s1, s1 +; GFX10-NEXT: v_pk_max_f16 v1, s3, s3 ; 
GFX10-NEXT: v_pk_max_f16 v2, s9, s9 -; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 +; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 ; GFX10-NEXT: v_pk_min_f16 v1, v2, v1 ; GFX10-NEXT: v_pk_min_f16 v0, v3, v0 @@ -826,8 +826,8 @@ define amdgpu_kernel void @minnum_v3f16( ; GFX11-LABEL: minnum_v3f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -860,28 +860,28 @@ entry: define amdgpu_kernel void @minnum_v4f16( ; SI-LABEL: minnum_v4f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; SI-NEXT: s_load_dwordx2 s[2:3], s[10:11], 0x0 +; SI-NEXT: s_mov_b32 s4, s8 +; SI-NEXT: s_mov_b32 s5, s9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: s_lshr_b32 s6, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 -; SI-NEXT: s_lshr_b32 s6, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s6, s5, 16 -; SI-NEXT: s_lshr_b32 s4, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: s_lshr_b32 s2, s2, 16 +; 
SI-NEXT: v_cvt_f32_f16_e32 v2, s2 +; SI-NEXT: s_lshr_b32 s2, s3, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s0 +; SI-NEXT: s_lshr_b32 s2, s1, 16 +; SI-NEXT: s_lshr_b32 s0, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s1 ; SI-NEXT: v_min_f32_e32 v3, v3, v5 ; SI-NEXT: v_min_f32_e32 v2, v2, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -894,13 +894,13 @@ define amdgpu_kernel void @minnum_v4f16( ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: minnum_v4f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -932,21 +932,21 @@ define amdgpu_kernel void @minnum_v4f16( ; ; GFX9-LABEL: minnum_v4f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 -; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 -; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v0, s9, s9 +; GFX9-NEXT: 
v_pk_max_f16 v1, s11, s11 +; GFX9-NEXT: v_pk_max_f16 v2, s8, s8 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v0, s12, s12 +; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 ; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -954,17 +954,17 @@ define amdgpu_kernel void @minnum_v4f16( ; GFX10-LABEL: minnum_v4f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s1, s1 +; GFX10-NEXT: v_pk_max_f16 v0, s3, s3 ; GFX10-NEXT: v_pk_max_f16 v1, s9, s9 -; GFX10-NEXT: v_pk_max_f16 v2, s0, s0 +; GFX10-NEXT: v_pk_max_f16 v2, s2, s2 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 ; GFX10-NEXT: v_pk_min_f16 v1, v1, v0 ; GFX10-NEXT: v_pk_min_f16 v0, v3, v2 @@ -974,8 +974,8 @@ define amdgpu_kernel void @minnum_v4f16( ; GFX11-LABEL: minnum_v4f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -1006,7 +1006,7 @@ entry: define amdgpu_kernel void @fmin_v4f16_imm_a( ; SI-LABEL: fmin_v4f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 
0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1035,7 +1035,7 @@ define amdgpu_kernel void @fmin_v4f16_imm_a( ; ; VI-LABEL: fmin_v4f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x4400 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -1062,7 +1062,7 @@ define amdgpu_kernel void @fmin_v4f16_imm_a( ; ; GFX9-LABEL: fmin_v4f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s8, 0x44004200 ; GFX9-NEXT: s_mov_b32 s9, 0x40004800 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 @@ -1081,7 +1081,7 @@ define amdgpu_kernel void @fmin_v4f16_imm_a( ; ; GFX10-LABEL: fmin_v4f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1096,7 +1096,7 @@ define amdgpu_kernel void @fmin_v4f16_imm_a( ; ; GFX11-LABEL: fmin_v4f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll index c3e665fa8269a..53ea253035655 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll @@ -332,7 +332,7 @@ bb: define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) { ; SI-LABEL: umulo_i64_s: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s2 @@ -365,7 +365,7 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) { ; ; 
GFX9-LABEL: umulo_i64_s: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s7, s0, s3 ; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2 @@ -394,7 +394,7 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) { ; ; GFX10-LABEL: umulo_i64_s: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mul_i32 s7, s0, s3 ; GFX10-NEXT: s_mul_hi_u32 s8, s0, s2 @@ -423,7 +423,7 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) { ; ; GFX11-LABEL: umulo_i64_s: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mul_i32 s7, s0, s3 ; GFX11-NEXT: s_mul_hi_u32 s8, s0, s2 @@ -454,7 +454,7 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) { ; ; GFX12-LABEL: umulo_i64_s: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mul_hi_u32 s7, s0, s3 @@ -491,7 +491,7 @@ bb: define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) { ; SI-LABEL: smulo_i64_s: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s2 @@ -540,7 +540,7 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) { ; ; GFX9-LABEL: smulo_i64_s: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s7, s0, s3 ; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2 @@ -581,7 +581,7 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) { ; ; GFX10-LABEL: 
smulo_i64_s: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mul_i32 s7, s0, s3 ; GFX10-NEXT: s_mul_hi_u32 s8, s0, s2 @@ -622,7 +622,7 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) { ; ; GFX11-LABEL: smulo_i64_s: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mul_i32 s7, s0, s3 ; GFX11-NEXT: s_mul_hi_u32 s8, s0, s2 @@ -667,7 +667,7 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) { ; ; GFX12-LABEL: smulo_i64_s: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mul_hi_u32 s7, s0, s3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll index 826862e124920..3d73f84b6e9a8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @local_size_x(ptr addrspace(1) %out) { ; SI-LABEL: local_size_x: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x6 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x6 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -17,12 +17,12 @@ define amdgpu_kernel void @local_size_x(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_x: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x18 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x18 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 
v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -45,8 +45,8 @@ entry: define amdgpu_kernel void @local_size_y(ptr addrspace(1) %out) { ; SI-LABEL: local_size_y: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x7 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x7 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -56,12 +56,12 @@ define amdgpu_kernel void @local_size_y(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_y: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x1c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x1c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -84,8 +84,8 @@ entry: define amdgpu_kernel void @local_size_z(ptr addrspace(1) %out) { ; SI-LABEL: local_size_z: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -95,12 +95,12 @@ define amdgpu_kernel void @local_size_z(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_z: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x20 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 
v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -123,8 +123,8 @@ entry: define amdgpu_kernel void @local_size_xy(ptr addrspace(1) %out) { ; SI-LABEL: local_size_xy: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x6 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x6 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mul_i32 s4, s4, s5 @@ -135,13 +135,13 @@ define amdgpu_kernel void @local_size_xy(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_xy: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x18 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x18 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s2, s2, s3 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_mul_i32 s0, s0, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -166,12 +166,12 @@ entry: define amdgpu_kernel void @local_size_xz(ptr addrspace(1) %out) { ; SI-LABEL: local_size_xz: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s2, s[0:1], 0x6 -; SI-NEXT: s_load_dword s4, s[0:1], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x6 +; SI-NEXT: s_load_dword s5, s[2:3], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mul_i32 s4, s2, s4 +; SI-NEXT: s_mul_i32 s4, s4, s5 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ 
-179,11 +179,11 @@ define amdgpu_kernel void @local_size_xz(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_xz: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s2, s[0:1], 0x18 -; VI-NEXT: s_load_dword s3, s[0:1], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x18 +; VI-NEXT: s_load_dword s5, s[2:3], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s2, s2, s3 +; VI-NEXT: s_mul_i32 s2, s4, s5 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -211,7 +211,7 @@ entry: define amdgpu_kernel void @local_size_yz(ptr addrspace(1) %out) { ; SI-LABEL: local_size_yz: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x7 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x7 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mul_i32 s0, s0, s1 @@ -224,7 +224,7 @@ define amdgpu_kernel void @local_size_yz(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_yz: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x1c +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x1c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mul_i32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -254,13 +254,13 @@ entry: define amdgpu_kernel void @local_size_xyz(ptr addrspace(1) %out) { ; SI-LABEL: local_size_xyz: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x6 -; SI-NEXT: s_load_dword s2, s[0:1], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x6 +; SI-NEXT: s_load_dword s6, s[2:3], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mul_i32 s4, s4, s5 -; SI-NEXT: s_add_i32 s4, s4, s2 +; SI-NEXT: s_mul_i32 s2, s4, s5 +; SI-NEXT: s_add_i32 s4, s2, s6 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -268,15 
+268,15 @@ define amdgpu_kernel void @local_size_xyz(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_xyz: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x18 -; VI-NEXT: s_load_dword s4, s[0:1], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x18 +; VI-NEXT: s_load_dword s4, s[2:3], 0x20 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s2, s2, s3 -; VI-NEXT: s_add_i32 s2, s2, s4 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_mul_i32 s0, s0, s1 +; VI-NEXT: s_add_i32 s0, s0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -304,8 +304,8 @@ entry: define amdgpu_kernel void @local_size_x_known_bits(ptr addrspace(1) %out) { ; SI-LABEL: local_size_x_known_bits: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x6 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x6 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -315,12 +315,12 @@ define amdgpu_kernel void @local_size_x_known_bits(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_x_known_bits: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x18 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x18 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -345,8 +345,8 @@ entry: define amdgpu_kernel void @local_size_y_known_bits(ptr addrspace(1) %out) { ; 
SI-LABEL: local_size_y_known_bits: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x7 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x7 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -356,12 +356,12 @@ define amdgpu_kernel void @local_size_y_known_bits(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_y_known_bits: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x1c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x1c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -386,8 +386,8 @@ entry: define amdgpu_kernel void @local_size_z_known_bits(ptr addrspace(1) %out) { ; SI-LABEL: local_size_z_known_bits: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -397,12 +397,12 @@ define amdgpu_kernel void @local_size_z_known_bits(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_z_known_bits: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x20 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], 
v2 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll index 84afa3b0096ea..47dd0263d020e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll @@ -10,7 +10,7 @@ declare <2 x half> @llvm.rint.v2f16(<2 x half> %a) define amdgpu_kernel void @rint_f16( ; SI-LABEL: rint_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -30,7 +30,7 @@ define amdgpu_kernel void @rint_f16( ; ; GFX89-LABEL: rint_f16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -48,7 +48,7 @@ define amdgpu_kernel void @rint_f16( ; ; GFX11-LABEL: rint_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -85,7 +85,7 @@ entry: define amdgpu_kernel void @rint_v2f16( ; SI-LABEL: rint_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -111,7 +111,7 @@ define amdgpu_kernel void @rint_v2f16( ; ; VI-LABEL: rint_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -131,7 +131,7 @@ define amdgpu_kernel void @rint_v2f16( ; ; GFX9-LABEL: rint_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -151,7 +151,7 @@ define amdgpu_kernel void @rint_v2f16( ; ; GFX11-LABEL: rint_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index ddbc5ef4e5b60..fc962b1b4a377 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 { ; SI-LABEL: round_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s5, 0xfffff ; SI-NEXT: s_mov_b32 s4, s6 @@ -41,7 +41,7 @@ define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 { ; ; CI-LABEL: round_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_brev_b32 s5, -2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 @@ -68,7 +68,7 @@ define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 { define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_round_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -108,7 +108,7 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; CI-LABEL: v_round_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: 
s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -141,64 +141,65 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in) #0 { ; SI-LABEL: round_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s9, 0xfffff -; SI-NEXT: s_mov_b32 s8, s2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s1, 0xfffff +; SI-NEXT: s_mov_b32 s0, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s3, s7, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], s3 -; SI-NEXT: s_and_b32 s12, s7, 0x80000000 -; SI-NEXT: s_andn2_b64 s[10:11], s[6:7], s[10:11] -; SI-NEXT: s_cmp_lt_i32 s3, 0 -; SI-NEXT: s_cselect_b32 s10, 0, s10 -; SI-NEXT: s_cselect_b32 s11, s12, s11 -; SI-NEXT: s_cmp_gt_i32 s3, 51 -; SI-NEXT: s_cselect_b32 s10, s6, s10 -; SI-NEXT: s_cselect_b32 s11, s7, s11 -; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: s_bfe_u32 s4, s11, 0xb0014 +; SI-NEXT: s_add_i32 s12, s4, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[0:1], s12 +; SI-NEXT: s_and_b32 s7, s11, 0x80000000 +; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[4:5] +; SI-NEXT: s_cmp_lt_i32 s12, 0 +; SI-NEXT: s_cselect_b32 s4, 0, s4 +; SI-NEXT: s_cselect_b32 s5, s7, s5 +; SI-NEXT: s_cmp_gt_i32 s12, 51 +; SI-NEXT: s_cselect_b32 s12, s10, s4 +; SI-NEXT: s_cselect_b32 s13, s11, s5 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: v_cmp_ge_f64_e64 s[14:15], |v[0:1]|, 0.5 +; SI-NEXT: s_brev_b32 s7, -2 +; SI-NEXT: s_and_b64 s[2:3], s[14:15], exec +; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: s_bfe_u32 s2, s9, 0xb0014 +; SI-NEXT: s_addk_i32 s2, 0xfc01 +; SI-NEXT: s_lshr_b64 
s[0:1], s[0:1], s2 +; SI-NEXT: s_andn2_b64 s[0:1], s[8:9], s[0:1] +; SI-NEXT: s_and_b32 s3, s9, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s2, 0 +; SI-NEXT: s_cselect_b32 s0, 0, s0 +; SI-NEXT: s_cselect_b32 s1, s3, s1 +; SI-NEXT: s_cmp_gt_i32 s2, 51 +; SI-NEXT: s_cselect_b32 s1, s9, s1 +; SI-NEXT: s_cselect_b32 s0, s8, s0 +; SI-NEXT: v_mov_b32_e32 v3, s1 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: v_add_f64 v[2:3], s[8:9], -v[2:3] ; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_add_f64 v[0:1], s[6:7], -v[0:1] -; SI-NEXT: v_mov_b32_e32 v4, s5 -; SI-NEXT: v_cmp_ge_f64_e64 s[12:13], |v[0:1]|, 0.5 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: s_and_b64 s[12:13], s[12:13], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: s_bfe_u32 s3, s5, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], s3 -; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7] -; SI-NEXT: s_and_b32 s8, s5, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 -; SI-NEXT: s_cselect_b32 s6, 0, s6 -; SI-NEXT: s_cselect_b32 s7, s8, s7 -; SI-NEXT: s_cmp_gt_i32 s3, 51 -; SI-NEXT: s_cselect_b32 s6, s4, s6 -; SI-NEXT: s_cselect_b32 s7, s5, s7 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_add_f64 v[2:3], s[4:5], -v[2:3] -; SI-NEXT: s_brev_b32 s12, -2 -; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[2:3]|, 0.5 -; SI-NEXT: v_bfi_b32 v1, s12, v0, v1 -; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[2:3]|, 0.5 +; SI-NEXT: v_bfi_b32 v1, s7, v0, v1 +; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec ; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_add_f64 v[2:3], s[10:11], v[0:1] -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_bfi_b32 v1, s12, v1, v4 -; SI-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 +; SI-NEXT: v_add_f64 v[2:3], s[12:13], v[0:1] +; SI-NEXT: 
v_mov_b32_e32 v1, s2 +; SI-NEXT: v_mov_b32_e32 v4, s9 +; SI-NEXT: v_bfi_b32 v1, s7, v1, v4 +; SI-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: round_v2f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 s3, 0xf000 @@ -232,151 +233,151 @@ define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in) define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) #0 { ; SI-LABEL: round_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s13, 0xfffff -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_brev_b32 s18, -2 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 +; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_mov_b32 s1, 0xfffff +; SI-NEXT: s_mov_b32 s0, s14 +; SI-NEXT: v_mov_b32_e32 v4, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s3, s7, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], s3 -; SI-NEXT: s_and_b32 s16, s7, 0x80000000 -; SI-NEXT: s_andn2_b64 s[14:15], s[6:7], s[14:15] -; SI-NEXT: s_cmp_lt_i32 s3, 0 -; SI-NEXT: s_cselect_b32 s14, 0, s14 -; SI-NEXT: s_cselect_b32 s15, s16, s15 -; SI-NEXT: s_cmp_gt_i32 s3, 51 -; SI-NEXT: s_cselect_b32 s14, s6, s14 -; SI-NEXT: s_cselect_b32 s15, s7, s15 -; SI-NEXT: v_mov_b32_e32 v0, s14 -; SI-NEXT: v_mov_b32_e32 v1, s15 +; SI-NEXT: s_bfe_u32 s12, s7, 0xb0014 +; SI-NEXT: s_add_i32 s16, s12, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[12:13], s[0:1], s16 +; SI-NEXT: s_and_b32 s15, s7, 0x80000000 +; SI-NEXT: s_andn2_b64 s[12:13], s[6:7], s[12:13] +; SI-NEXT: s_cmp_lt_i32 s16, 0 +; SI-NEXT: s_cselect_b32 s12, 0, s12 
+; SI-NEXT: s_cselect_b32 s13, s15, s13 +; SI-NEXT: s_cmp_gt_i32 s16, 51 +; SI-NEXT: s_cselect_b32 s16, s6, s12 +; SI-NEXT: s_cselect_b32 s17, s7, s13 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_add_f64 v[0:1], s[6:7], -v[0:1] -; SI-NEXT: v_mov_b32_e32 v4, 0 -; SI-NEXT: v_cmp_ge_f64_e64 s[16:17], |v[0:1]|, 0.5 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: s_and_b64 s[16:17], s[16:17], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: s_bfe_u32 s3, s5, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[6:7], s[12:13], s3 -; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7] -; SI-NEXT: s_and_b32 s16, s5, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 -; SI-NEXT: s_cselect_b32 s6, 0, s6 -; SI-NEXT: s_cselect_b32 s7, s16, s7 -; SI-NEXT: s_cmp_gt_i32 s3, 51 -; SI-NEXT: s_cselect_b32 s6, s4, s6 -; SI-NEXT: v_bfi_b32 v5, s18, v0, v1 -; SI-NEXT: s_cselect_b32 s7, s5, s7 -; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 +; SI-NEXT: v_cmp_ge_f64_e64 s[18:19], |v[0:1]|, 0.5 ; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_and_b64 s[2:3], s[18:19], exec +; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: s_bfe_u32 s2, s5, 0xb0014 +; SI-NEXT: s_add_i32 s6, s2, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[2:3], s[0:1], s6 +; SI-NEXT: s_andn2_b64 s[2:3], s[4:5], s[2:3] +; SI-NEXT: s_and_b32 s7, s5, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s6, 0 +; SI-NEXT: s_cselect_b32 s2, 0, s2 +; SI-NEXT: s_cselect_b32 s3, s7, s3 +; SI-NEXT: s_cmp_gt_i32 s6, 51 +; SI-NEXT: s_brev_b32 s15, -2 +; SI-NEXT: s_cselect_b32 s2, s4, s2 +; SI-NEXT: v_bfi_b32 v5, s15, v0, v1 +; SI-NEXT: s_cselect_b32 s3, s5, s3 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: v_add_f64 v[0:1], s[4:5], -v[0:1] -; SI-NEXT: v_add_f64 v[2:3], s[14:15], v[4:5] -; SI-NEXT: v_cmp_ge_f64_e64 s[16:17], |v[0:1]|, 0.5 +; SI-NEXT: v_add_f64 v[2:3], s[16:17], v[4:5] +; 
SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[0:1]|, 0.5 ; SI-NEXT: v_mov_b32_e32 v6, s5 -; SI-NEXT: s_and_b64 s[14:15], s[16:17], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v5, s3 -; SI-NEXT: s_bfe_u32 s3, s11, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[12:13], s3 +; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec +; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_bfe_u32 s4, s11, 0xb0014 +; SI-NEXT: s_add_i32 s6, s4, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[0:1], s6 ; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[4:5] -; SI-NEXT: s_and_b32 s14, s11, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_and_b32 s7, s11, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s6, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s14, s5 -; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s5, s7, s5 +; SI-NEXT: s_cmp_gt_i32 s6, 51 ; SI-NEXT: s_cselect_b32 s4, s10, s4 ; SI-NEXT: s_cselect_b32 s5, s11, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] -; SI-NEXT: v_bfi_b32 v5, s18, v5, v6 -; SI-NEXT: v_cmp_ge_f64_e64 s[14:15], |v[0:1]|, 0.5 -; SI-NEXT: v_add_f64 v[0:1], s[6:7], v[4:5] -; SI-NEXT: s_and_b64 s[6:7], s[14:15], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v8, s3 -; SI-NEXT: s_bfe_u32 s3, s9, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[6:7], s[12:13], s3 -; SI-NEXT: s_andn2_b64 s[6:7], s[8:9], s[6:7] -; SI-NEXT: s_and_b32 s10, s9, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 -; SI-NEXT: s_cselect_b32 s6, 0, s6 -; SI-NEXT: s_cselect_b32 s7, s10, s7 -; SI-NEXT: s_cmp_gt_i32 s3, 51 -; SI-NEXT: s_cselect_b32 s6, s8, s6 -; SI-NEXT: s_cselect_b32 s7, s9, s7 -; SI-NEXT: v_mov_b32_e32 v5, s6 -; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_bfi_b32 v5, s15, v5, v6 +; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[0:1]|, 0.5 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], v[4:5] +; 
SI-NEXT: s_and_b64 s[2:3], s[6:7], exec +; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v8, s2 +; SI-NEXT: s_bfe_u32 s2, s9, 0xb0014 +; SI-NEXT: s_addk_i32 s2, 0xfc01 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; SI-NEXT: s_andn2_b64 s[0:1], s[8:9], s[0:1] +; SI-NEXT: s_and_b32 s3, s9, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s2, 0 +; SI-NEXT: s_cselect_b32 s0, 0, s0 +; SI-NEXT: s_cselect_b32 s1, s3, s1 +; SI-NEXT: s_cmp_gt_i32 s2, 51 +; SI-NEXT: s_cselect_b32 s1, s9, s1 +; SI-NEXT: s_cselect_b32 s0, s8, s0 +; SI-NEXT: v_mov_b32_e32 v6, s1 +; SI-NEXT: v_mov_b32_e32 v5, s0 ; SI-NEXT: v_add_f64 v[6:7], s[8:9], -v[5:6] ; SI-NEXT: v_mov_b32_e32 v9, s11 -; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[6:7]|, 0.5 -; SI-NEXT: v_bfi_b32 v5, s18, v8, v9 +; SI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[6:7]|, 0.5 +; SI-NEXT: v_bfi_b32 v5, s15, v8, v9 +; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec +; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[4:5] -; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: v_mov_b32_e32 v5, s2 ; SI-NEXT: v_mov_b32_e32 v8, s9 -; SI-NEXT: v_bfi_b32 v5, s18, v5, v8 -; SI-NEXT: v_add_f64 v[4:5], s[6:7], v[4:5] -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: v_bfi_b32 v5, s15, v5, v8 +; SI-NEXT: v_add_f64 v[4:5], s[0:1], v[4:5] +; SI-NEXT: s_mov_b32 s15, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: round_v4f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 -; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 +; CI-NEXT: s_brev_b32 s14, -2 ; CI-NEXT: v_mov_b32_e32 v4, 0 -; CI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 +; CI-NEXT: s_mov_b32 s15, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7] -; CI-NEXT: v_mov_b32_e32 v5, s7 -; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] ; CI-NEXT: v_trunc_f64_e32 v[6:7], s[4:5] -; CI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5 +; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] +; CI-NEXT: v_mov_b32_e32 v5, s7 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[2:3]|, 0.5 ; CI-NEXT: v_add_f64 v[2:3], s[4:5], -v[6:7] -; CI-NEXT: s_and_b64 s[6:7], s[6:7], exec -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v8, s4 -; CI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5 -; CI-NEXT: v_bfi_b32 v5, s2, v8, v5 +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v8, s0 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[2:3]|, 0.5 +; CI-NEXT: v_bfi_b32 v5, s14, v8, v5 ; CI-NEXT: v_trunc_f64_e32 v[8:9], s[10:11] -; CI-NEXT: s_and_b64 s[6:7], s[6:7], exec +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec ; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[0:1], s[10:11], -v[8:9] -; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_mov_b32_e32 v5, s0 ; CI-NEXT: v_mov_b32_e32 v10, s5 -; CI-NEXT: v_bfi_b32 v5, s2, v5, v10 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[0:1]|, 0.5 +; CI-NEXT: v_bfi_b32 v5, s14, v5, v10 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[0:1]|, 0.5 ; CI-NEXT: v_trunc_f64_e32 v[10:11], s[8:9] ; CI-NEXT: v_add_f64 v[0:1], v[6:7], v[4:5] -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec ; CI-NEXT: v_add_f64 v[6:7], s[8:9], -v[10:11] -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v5, s4 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[6:7]|, 0.5 +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v5, 
s0 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[6:7]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v12, s11 -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: v_bfi_b32 v5, s2, v5, v12 -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: v_bfi_b32 v5, s14, v5, v12 +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_mov_b32_e32 v5, s0 ; CI-NEXT: v_mov_b32_e32 v8, s9 -; CI-NEXT: v_bfi_b32 v5, s2, v5, v8 +; CI-NEXT: v_bfi_b32 v5, s14, v5, v8 ; CI-NEXT: v_add_f64 v[4:5], v[10:11], v[4:5] -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: s_mov_b32 s14, -1 +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; CI-NEXT: s_endpgm %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1 store <4 x double> %result, ptr addrspace(1) %out @@ -386,124 +387,125 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) #0 { ; SI-LABEL: round_v8f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s21, 0xfffff -; SI-NEXT: s_mov_b32 s20, s2 +; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 +; SI-NEXT: s_mov_b32 s22, -1 +; SI-NEXT: s_mov_b32 s1, 0xfffff +; SI-NEXT: s_mov_b32 s0, s22 ; SI-NEXT: v_mov_b32_e32 v8, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s3, s7, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[22:23], s[20:21], s3 -; SI-NEXT: s_and_b32 s24, s7, 0x80000000 -; SI-NEXT: s_andn2_b64 s[22:23], s[6:7], s[22:23] -; SI-NEXT: s_cmp_lt_i32 s3, 0 -; SI-NEXT: s_cselect_b32 s22, 0, s22 -; SI-NEXT: s_cselect_b32 s23, s24, s23 -; SI-NEXT: s_cmp_gt_i32 s3, 51 -; SI-NEXT: 
s_cselect_b32 s22, s6, s22 -; SI-NEXT: s_cselect_b32 s23, s7, s23 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_mov_b32_e32 v1, s23 +; SI-NEXT: s_bfe_u32 s20, s7, 0xb0014 +; SI-NEXT: s_add_i32 s24, s20, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[20:21], s[0:1], s24 +; SI-NEXT: s_and_b32 s23, s7, 0x80000000 +; SI-NEXT: s_andn2_b64 s[20:21], s[6:7], s[20:21] +; SI-NEXT: s_cmp_lt_i32 s24, 0 +; SI-NEXT: s_cselect_b32 s20, 0, s20 +; SI-NEXT: s_cselect_b32 s21, s23, s21 +; SI-NEXT: s_cmp_gt_i32 s24, 51 +; SI-NEXT: s_cselect_b32 s24, s6, s20 +; SI-NEXT: s_cselect_b32 s25, s7, s21 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_mov_b32_e32 v1, s25 ; SI-NEXT: v_add_f64 v[0:1], s[6:7], -v[0:1] -; SI-NEXT: s_brev_b32 s3, -2 -; SI-NEXT: v_cmp_ge_f64_e64 s[24:25], |v[0:1]|, 0.5 +; SI-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 +; SI-NEXT: v_cmp_ge_f64_e64 s[26:27], |v[0:1]|, 0.5 ; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: s_and_b64 s[24:25], s[24:25], exec -; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: s_bfe_u32 s6, s5, 0xb0014 -; SI-NEXT: s_add_i32 s24, s6, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s24 +; SI-NEXT: s_and_b64 s[2:3], s[26:27], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: s_bfe_u32 s3, s5, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[6:7], s[0:1], s3 ; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7] -; SI-NEXT: s_and_b32 s25, s5, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s24, 0 +; SI-NEXT: s_and_b32 s23, s5, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 ; SI-NEXT: s_cselect_b32 s6, 0, s6 -; SI-NEXT: s_cselect_b32 s7, s25, s7 -; SI-NEXT: s_cmp_gt_i32 s24, 51 +; SI-NEXT: s_cselect_b32 s7, s23, s7 +; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_brev_b32 s2, -2 ; SI-NEXT: s_cselect_b32 s6, s4, s6 -; SI-NEXT: v_bfi_b32 v9, s3, v0, v1 +; SI-NEXT: v_bfi_b32 v9, s2, v0, v1 ; SI-NEXT: s_cselect_b32 s7, s5, s7 ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_mov_b32_e32 v1, s7 
; SI-NEXT: v_add_f64 v[0:1], s[4:5], -v[0:1] -; SI-NEXT: v_add_f64 v[2:3], s[22:23], v[8:9] -; SI-NEXT: v_cmp_ge_f64_e64 s[24:25], |v[0:1]|, 0.5 +; SI-NEXT: v_add_f64 v[2:3], s[24:25], v[8:9] +; SI-NEXT: v_cmp_ge_f64_e64 s[26:27], |v[0:1]|, 0.5 ; SI-NEXT: v_mov_b32_e32 v5, s5 -; SI-NEXT: s_and_b64 s[22:23], s[24:25], exec -; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_bfe_u32 s4, s11, 0xb0014 -; SI-NEXT: s_add_i32 s22, s4, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s22 +; SI-NEXT: s_and_b64 s[24:25], s[26:27], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v4, s3 +; SI-NEXT: s_bfe_u32 s3, s11, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[0:1], s3 ; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[4:5] ; SI-NEXT: s_and_b32 s23, s11, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s22, 0 +; SI-NEXT: s_cmp_lt_i32 s3, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 ; SI-NEXT: s_cselect_b32 s5, s23, s5 -; SI-NEXT: s_cmp_gt_i32 s22, 51 +; SI-NEXT: s_cmp_gt_i32 s3, 51 ; SI-NEXT: s_cselect_b32 s4, s10, s4 ; SI-NEXT: s_cselect_b32 s5, s11, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] -; SI-NEXT: v_bfi_b32 v9, s3, v4, v5 -; SI-NEXT: v_cmp_ge_f64_e64 s[22:23], |v[0:1]|, 0.5 +; SI-NEXT: v_bfi_b32 v9, s2, v4, v5 +; SI-NEXT: v_cmp_ge_f64_e64 s[24:25], |v[0:1]|, 0.5 ; SI-NEXT: v_add_f64 v[0:1], s[6:7], v[8:9] -; SI-NEXT: s_and_b64 s[6:7], s[22:23], exec -; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v6, s6 -; SI-NEXT: s_bfe_u32 s6, s9, 0xb0014 -; SI-NEXT: s_add_i32 s10, s6, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s10 -; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: s_and_b64 s[6:7], s[24:25], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v6, s3 +; SI-NEXT: s_bfe_u32 s3, s9, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[6:7], s[0:1], s3 ; SI-NEXT: 
s_andn2_b64 s[6:7], s[8:9], s[6:7] -; SI-NEXT: s_and_b32 s11, s9, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s10, 0 +; SI-NEXT: s_and_b32 s10, s9, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 ; SI-NEXT: s_cselect_b32 s6, 0, s6 -; SI-NEXT: s_cselect_b32 s7, s11, s7 -; SI-NEXT: s_cmp_gt_i32 s10, 51 +; SI-NEXT: s_cselect_b32 s7, s10, s7 +; SI-NEXT: s_cmp_gt_i32 s3, 51 ; SI-NEXT: s_cselect_b32 s6, s8, s6 ; SI-NEXT: s_cselect_b32 s7, s9, s7 ; SI-NEXT: v_mov_b32_e32 v4, s6 ; SI-NEXT: v_mov_b32_e32 v5, s7 ; SI-NEXT: v_add_f64 v[4:5], s[8:9], -v[4:5] -; SI-NEXT: v_bfi_b32 v9, s3, v6, v7 +; SI-NEXT: v_mov_b32_e32 v7, s11 ; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[4:5]|, 0.5 +; SI-NEXT: v_bfi_b32 v9, s2, v6, v7 ; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[8:9] ; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec -; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v9, s4 -; SI-NEXT: s_bfe_u32 s4, s15, 0xb0014 -; SI-NEXT: s_add_i32 s8, s4, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s8 -; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v9, s3 +; SI-NEXT: s_bfe_u32 s3, s15, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[0:1], s3 ; SI-NEXT: s_andn2_b64 s[4:5], s[14:15], s[4:5] -; SI-NEXT: s_and_b32 s9, s15, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s8, 0 +; SI-NEXT: s_and_b32 s8, s15, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s9, s5 -; SI-NEXT: s_cmp_gt_i32 s8, 51 +; SI-NEXT: s_cselect_b32 s5, s8, s5 +; SI-NEXT: s_cmp_gt_i32 s3, 51 ; SI-NEXT: s_cselect_b32 s4, s14, s4 ; SI-NEXT: s_cselect_b32 s5, s15, s5 ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: v_mov_b32_e32 v5, s5 ; SI-NEXT: v_add_f64 v[4:5], s[14:15], -v[4:5] -; SI-NEXT: v_bfi_b32 v9, s3, v9, v10 +; SI-NEXT: v_mov_b32_e32 v10, s9 ; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[4:5]|, 0.5 +; SI-NEXT: v_bfi_b32 v9, s2, v9, v10 ; SI-NEXT: v_add_f64 v[4:5], s[6:7], v[8:9] ; SI-NEXT: s_and_b64 s[6:7], 
s[8:9], exec -; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v12, s6 -; SI-NEXT: s_bfe_u32 s6, s13, 0xb0014 -; SI-NEXT: s_add_i32 s8, s6, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s8 +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v12, s3 +; SI-NEXT: s_bfe_u32 s3, s13, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[6:7], s[0:1], s3 ; SI-NEXT: s_andn2_b64 s[6:7], s[12:13], s[6:7] -; SI-NEXT: s_and_b32 s9, s13, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s8, 0 +; SI-NEXT: s_and_b32 s8, s13, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 ; SI-NEXT: s_cselect_b32 s6, 0, s6 -; SI-NEXT: s_cselect_b32 s7, s9, s7 -; SI-NEXT: s_cmp_gt_i32 s8, 51 +; SI-NEXT: s_cselect_b32 s7, s8, s7 +; SI-NEXT: s_cmp_gt_i32 s3, 51 ; SI-NEXT: s_cselect_b32 s7, s13, s7 ; SI-NEXT: s_cselect_b32 s6, s12, s6 ; SI-NEXT: v_mov_b32_e32 v10, s7 @@ -511,20 +513,20 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_add_f64 v[10:11], s[12:13], -v[9:10] ; SI-NEXT: v_mov_b32_e32 v13, s15 ; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[10:11]|, 0.5 -; SI-NEXT: v_bfi_b32 v9, s3, v12, v13 +; SI-NEXT: v_bfi_b32 v9, s2, v12, v13 ; SI-NEXT: v_add_f64 v[12:13], s[4:5], v[8:9] ; SI-NEXT: s_and_b64 s[4:5], s[8:9], exec -; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v14, s4 -; SI-NEXT: s_bfe_u32 s4, s19, 0xb0014 -; SI-NEXT: s_add_i32 s8, s4, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s8 +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v14, s3 +; SI-NEXT: s_bfe_u32 s3, s19, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[0:1], s3 ; SI-NEXT: s_andn2_b64 s[4:5], s[18:19], s[4:5] -; SI-NEXT: s_and_b32 s9, s19, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s8, 0 +; SI-NEXT: s_and_b32 s8, s19, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s9, s5 -; SI-NEXT: s_cmp_gt_i32 s8, 51 +; SI-NEXT: 
s_cselect_b32 s5, s8, s5 +; SI-NEXT: s_cmp_gt_i32 s3, 51 ; SI-NEXT: s_cselect_b32 s5, s19, s5 ; SI-NEXT: s_cselect_b32 s4, s18, s4 ; SI-NEXT: v_mov_b32_e32 v10, s5 @@ -532,129 +534,128 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_add_f64 v[10:11], s[18:19], -v[9:10] ; SI-NEXT: v_mov_b32_e32 v15, s13 ; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[10:11]|, 0.5 -; SI-NEXT: v_bfi_b32 v9, s3, v14, v15 +; SI-NEXT: v_bfi_b32 v9, s2, v14, v15 ; SI-NEXT: v_add_f64 v[10:11], s[6:7], v[8:9] ; SI-NEXT: s_and_b64 s[6:7], s[8:9], exec -; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v9, s6 -; SI-NEXT: s_bfe_u32 s6, s17, 0xb0014 -; SI-NEXT: s_add_i32 s8, s6, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s8 -; SI-NEXT: s_andn2_b64 s[6:7], s[16:17], s[6:7] -; SI-NEXT: s_and_b32 s9, s17, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s8, 0 -; SI-NEXT: s_cselect_b32 s6, 0, s6 -; SI-NEXT: s_cselect_b32 s7, s9, s7 -; SI-NEXT: s_cmp_gt_i32 s8, 51 -; SI-NEXT: s_cselect_b32 s7, s17, s7 -; SI-NEXT: s_cselect_b32 s6, s16, s6 -; SI-NEXT: v_mov_b32_e32 v15, s7 -; SI-NEXT: v_mov_b32_e32 v14, s6 +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v9, s3 +; SI-NEXT: s_bfe_u32 s3, s17, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3 +; SI-NEXT: s_andn2_b64 s[0:1], s[16:17], s[0:1] +; SI-NEXT: s_and_b32 s6, s17, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cselect_b32 s0, 0, s0 +; SI-NEXT: s_cselect_b32 s1, s6, s1 +; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s1, s17, s1 +; SI-NEXT: s_cselect_b32 s0, s16, s0 +; SI-NEXT: v_mov_b32_e32 v15, s1 +; SI-NEXT: v_mov_b32_e32 v14, s0 ; SI-NEXT: v_add_f64 v[14:15], s[16:17], -v[14:15] ; SI-NEXT: v_mov_b32_e32 v16, s19 -; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[14:15]|, 0.5 -; SI-NEXT: v_bfi_b32 v9, s3, v9, v16 +; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[14:15]|, 0.5 +; SI-NEXT: v_bfi_b32 v9, s2, v9, v16 ; SI-NEXT: v_add_f64 
v[16:17], s[4:5], v[8:9] -; SI-NEXT: s_and_b64 s[4:5], s[8:9], exec -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b64 s[4:5], s[6:7], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v9, s3 ; SI-NEXT: v_mov_b32_e32 v14, s17 -; SI-NEXT: v_bfi_b32 v9, s3, v9, v14 -; SI-NEXT: v_add_f64 v[14:15], s[6:7], v[8:9] -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: v_bfi_b32 v9, s2, v9, v14 +; SI-NEXT: v_add_f64 v[14:15], s[0:1], v[8:9] +; SI-NEXT: s_mov_b32 s23, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 -; SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[20:23], 0 offset:48 +; SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[20:23], 0 offset:32 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: round_v8f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 -; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 +; CI-NEXT: s_brev_b32 s22, -2 ; CI-NEXT: v_mov_b32_e32 v4, 0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 +; CI-NEXT: s_mov_b32 s23, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7] ; CI-NEXT: v_trunc_f64_e32 v[6:7], s[4:5] ; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] ; CI-NEXT: v_mov_b32_e32 v5, s7 -; CI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[2:3]|, 0.5 ; CI-NEXT: v_add_f64 v[2:3], s[4:5], -v[6:7] -; CI-NEXT: s_and_b64 s[6:7], s[6:7], exec -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; CI-NEXT: 
v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5 -; CI-NEXT: v_mov_b32_e32 v8, s4 -; CI-NEXT: v_bfi_b32 v5, s2, v8, v5 -; CI-NEXT: s_and_b64 s[6:7], s[6:7], exec +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v8, s0 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[2:3]|, 0.5 +; CI-NEXT: v_bfi_b32 v5, s22, v8, v5 +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec ; CI-NEXT: v_trunc_f64_e32 v[8:9], s[10:11] -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_mov_b32_e32 v5, s0 ; CI-NEXT: v_mov_b32_e32 v10, s5 ; CI-NEXT: v_add_f64 v[0:1], s[10:11], -v[8:9] -; CI-NEXT: v_bfi_b32 v5, s2, v5, v10 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[0:1]|, 0.5 +; CI-NEXT: v_bfi_b32 v5, s22, v5, v10 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[0:1]|, 0.5 ; CI-NEXT: v_add_f64 v[0:1], v[6:7], v[4:5] ; CI-NEXT: v_trunc_f64_e32 v[6:7], s[8:9] -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec ; CI-NEXT: v_add_f64 v[10:11], s[8:9], -v[6:7] -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v5, s4 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[10:11]|, 0.5 +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[10:11]|, 0.5 ; CI-NEXT: v_trunc_f64_e32 v[10:11], s[14:15] ; CI-NEXT: v_mov_b32_e32 v12, s11 -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: v_bfi_b32 v5, s2, v5, v12 -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: v_bfi_b32 v5, s22, v5, v12 +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[12:13], s[14:15], -v[10:11] ; CI-NEXT: v_add_f64 v[8:9], v[8:9], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_mov_b32_e32 v5, s0 ; CI-NEXT: v_mov_b32_e32 v14, s9 -; CI-NEXT: v_bfi_b32 v5, s2, v5, v14 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[12:13]|, 0.5 
+; CI-NEXT: v_bfi_b32 v5, s22, v5, v14 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[12:13]|, 0.5 ; CI-NEXT: v_trunc_f64_e32 v[14:15], s[12:13] -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec ; CI-NEXT: v_add_f64 v[12:13], s[12:13], -v[14:15] -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s4 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[12:13]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[12:13]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v16, s15 -; CI-NEXT: v_bfi_b32 v5, s2, v5, v16 -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: v_bfi_b32 v5, s22, v5, v16 +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec ; CI-NEXT: v_trunc_f64_e32 v[16:17], s[18:19] -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[12:13], v[10:11], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_mov_b32_e32 v5, s0 ; CI-NEXT: v_mov_b32_e32 v18, s13 ; CI-NEXT: v_add_f64 v[10:11], s[18:19], -v[16:17] -; CI-NEXT: v_bfi_b32 v5, s2, v5, v18 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[10:11]|, 0.5 +; CI-NEXT: v_bfi_b32 v5, s22, v5, v18 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[10:11]|, 0.5 ; CI-NEXT: v_add_f64 v[10:11], v[14:15], v[4:5] ; CI-NEXT: v_trunc_f64_e32 v[14:15], s[16:17] -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec ; CI-NEXT: v_add_f64 v[18:19], s[16:17], -v[14:15] -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v5, s4 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[18:19]|, 0.5 +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[18:19]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v20, s19 -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: v_bfi_b32 v5, s2, v5, v20 -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: 
v_bfi_b32 v5, s22, v5, v20 +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[16:17], v[16:17], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_mov_b32_e32 v5, s0 ; CI-NEXT: v_mov_b32_e32 v18, s17 -; CI-NEXT: v_bfi_b32 v5, s2, v5, v18 +; CI-NEXT: v_bfi_b32 v5, s22, v5, v18 ; CI-NEXT: v_add_f64 v[14:15], v[14:15], v[4:5] -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 -; CI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 -; CI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: s_mov_b32 s22, -1 +; CI-NEXT: buffer_store_dwordx4 v[14:17], off, s[20:23], 0 offset:48 +; CI-NEXT: buffer_store_dwordx4 v[10:13], off, s[20:23], 0 offset:32 +; CI-NEXT: buffer_store_dwordx4 v[6:9], off, s[20:23], 0 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 ; CI-NEXT: s_endpgm %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1 store <8 x double> %result, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll index 7ad7cc821c1b5..d5b4f879bf8a0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll @@ -8,8 +8,8 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 { ; GFX6-LABEL: round_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s6, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s6, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -24,57 +24,39 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 { ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX8-LABEL: round_f32: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_trunc_f32_e32 v0, s6 -; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] -; GFX8-NEXT: s_brev_b32 s4, -2 -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: round_f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_trunc_f32_e32 v0, s2 -; GFX9-NEXT: v_sub_f32_e32 v1, s2, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] -; GFX9-NEXT: s_brev_b32 s0, -2 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GFX9-NEXT: s_endpgm +; GFX89-LABEL: round_f32: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: v_trunc_f32_e32 v0, s6 +; GFX89-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] +; GFX89-NEXT: s_brev_b32 s4, -2 +; GFX89-NEXT: v_mov_b32_e32 v2, s6 +; GFX89-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX89-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX89-NEXT: s_endpgm ; ; GFX11-LABEL: round_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, 
s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_trunc_f32_e32 v0, s2 +; GFX11-NEXT: v_trunc_f32_e32 v0, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f32_e32 v1, s2, v0 -; GFX11-NEXT: v_cmp_ge_f32_e64 s3, |v1|, 0.5 +; GFX11-NEXT: v_sub_f32_e32 v1, s4, v0 +; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v1|, 0.5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s3 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -109,7 +91,7 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 { define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) #0 { ; GFX6-LABEL: round_v2f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_brev_b32 s8, -2 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -135,7 +117,7 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) # ; ; GFX89-LABEL: round_v2f32: ; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: s_brev_b32 s8, -2 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 @@ -161,7 +143,7 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) # ; ; GFX11-LABEL: round_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-NEXT: v_trunc_f32_e32 v0, s3 ; GFX11-NEXT: v_trunc_f32_e32 v2, s2 @@ -216,8 +198,8 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) # define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) #0 { ; GFX6-LABEL: round_v4f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_brev_b32 s10, -2 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -253,89 +235,50 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) # ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX8-LABEL: round_v4f32: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_brev_b32 s10, -2 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_trunc_f32_e32 v0, s7 -; GFX8-NEXT: v_sub_f32_e32 v1, s7, v0 -; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v2, s7 -; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2 -; GFX8-NEXT: v_add_f32_e32 v3, v0, v1 -; GFX8-NEXT: v_trunc_f32_e32 v0, s6 -; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2 -; GFX8-NEXT: v_add_f32_e32 v2, v0, v1 -; GFX8-NEXT: v_trunc_f32_e32 v0, s5 -; GFX8-NEXT: v_sub_f32_e32 v1, s5, v0 -; GFX8-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v4 -; GFX8-NEXT: v_add_f32_e32 v1, v0, v1 -; GFX8-NEXT: v_trunc_f32_e32 v0, s4 -; GFX8-NEXT: v_sub_f32_e32 v4, s4, 
v0 -; GFX8-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: v_bfi_b32 v4, s10, v4, v5 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: round_v4f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 -; GFX9-NEXT: s_brev_b32 s2, -2 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_trunc_f32_e32 v0, s7 -; GFX9-NEXT: v_sub_f32_e32 v1, s7, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 -; GFX9-NEXT: v_add_f32_e32 v3, v0, v1 -; GFX9-NEXT: v_trunc_f32_e32 v0, s6 -; GFX9-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 -; GFX9-NEXT: v_add_f32_e32 v2, v0, v1 -; GFX9-NEXT: v_trunc_f32_e32 v0, s5 -; GFX9-NEXT: v_sub_f32_e32 v1, s5, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v4 -; GFX9-NEXT: v_add_f32_e32 v1, v0, v1 -; GFX9-NEXT: v_trunc_f32_e32 v0, s4 -; GFX9-NEXT: v_sub_f32_e32 v4, s4, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-NEXT: v_bfi_b32 v4, s2, v4, v5 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 -; GFX9-NEXT: s_endpgm +; GFX89-LABEL: round_v4f32: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX89-NEXT: s_brev_b32 
s10, -2 +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: v_trunc_f32_e32 v0, s7 +; GFX89-NEXT: v_sub_f32_e32 v1, s7, v0 +; GFX89-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] +; GFX89-NEXT: v_mov_b32_e32 v2, s7 +; GFX89-NEXT: v_bfi_b32 v1, s10, v1, v2 +; GFX89-NEXT: v_add_f32_e32 v3, v0, v1 +; GFX89-NEXT: v_trunc_f32_e32 v0, s6 +; GFX89-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX89-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] +; GFX89-NEXT: v_mov_b32_e32 v2, s6 +; GFX89-NEXT: v_bfi_b32 v1, s10, v1, v2 +; GFX89-NEXT: v_add_f32_e32 v2, v0, v1 +; GFX89-NEXT: v_trunc_f32_e32 v0, s5 +; GFX89-NEXT: v_sub_f32_e32 v1, s5, v0 +; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] +; GFX89-NEXT: v_mov_b32_e32 v4, s5 +; GFX89-NEXT: v_bfi_b32 v1, s10, v1, v4 +; GFX89-NEXT: v_add_f32_e32 v1, v0, v1 +; GFX89-NEXT: v_trunc_f32_e32 v0, s4 +; GFX89-NEXT: v_sub_f32_e32 v4, s4, v0 +; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] +; GFX89-NEXT: v_mov_b32_e32 v5, s4 +; GFX89-NEXT: v_bfi_b32 v4, s10, v4, v5 +; GFX89-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX89-NEXT: s_endpgm ; ; GFX11-LABEL: round_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_trunc_f32_e32 v0, s7 @@ -412,145 +355,145 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) # define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) #0 { ; GFX6-LABEL: round_v8f32: ; GFX6: ; %bb.0: -; 
GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 -; GFX6-NEXT: s_brev_b32 s14, -2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 +; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 +; GFX6-NEXT: s_brev_b32 s2, -2 +; GFX6-NEXT: s_mov_b32 s15, 0xf000 +; GFX6-NEXT: s_mov_b32 s14, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_trunc_f32_e32 v0, s7 ; GFX6-NEXT: v_sub_f32_e32 v1, s7, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v2, s7 -; GFX6-NEXT: v_bfi_b32 v1, s14, v1, v2 +; GFX6-NEXT: v_bfi_b32 v1, s2, v1, v2 ; GFX6-NEXT: v_add_f32_e32 v3, v0, v1 ; GFX6-NEXT: v_trunc_f32_e32 v0, s6 ; GFX6-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_bfi_b32 v1, s14, v1, v2 +; GFX6-NEXT: v_bfi_b32 v1, s2, v1, v2 ; GFX6-NEXT: v_add_f32_e32 v2, v0, v1 ; GFX6-NEXT: v_trunc_f32_e32 v0, s5 ; GFX6-NEXT: v_sub_f32_e32 v1, s5, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v4, s5 -; GFX6-NEXT: v_bfi_b32 v1, s14, v1, v4 +; GFX6-NEXT: v_bfi_b32 v1, s2, v1, v4 ; GFX6-NEXT: v_add_f32_e32 v1, v0, v1 ; GFX6-NEXT: v_trunc_f32_e32 v0, s4 ; GFX6-NEXT: v_sub_f32_e32 v4, s4, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1] ; 
GFX6-NEXT: v_mov_b32_e32 v5, s4 -; GFX6-NEXT: v_bfi_b32 v4, s14, v4, v5 +; GFX6-NEXT: v_bfi_b32 v4, s2, v4, v5 ; GFX6-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, s11 ; GFX6-NEXT: v_sub_f32_e32 v5, s11, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v6, s11 -; GFX6-NEXT: v_bfi_b32 v5, s14, v5, v6 +; GFX6-NEXT: v_bfi_b32 v5, s2, v5, v6 ; GFX6-NEXT: v_add_f32_e32 v7, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v4, s10 ; GFX6-NEXT: v_sub_f32_e32 v5, s10, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v6, s10 -; GFX6-NEXT: v_bfi_b32 v5, s14, v5, v6 +; GFX6-NEXT: v_bfi_b32 v5, s2, v5, v6 ; GFX6-NEXT: v_add_f32_e32 v6, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v4, s9 ; GFX6-NEXT: v_sub_f32_e32 v5, s9, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v8, s9 -; GFX6-NEXT: v_bfi_b32 v5, s14, v5, v8 +; GFX6-NEXT: v_bfi_b32 v5, s2, v5, v8 ; GFX6-NEXT: v_add_f32_e32 v5, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v4, s8 ; GFX6-NEXT: v_sub_f32_e32 v8, s8, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[4:5] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v9, s8 -; GFX6-NEXT: v_bfi_b32 v8, s14, v8, v9 +; GFX6-NEXT: v_bfi_b32 v8, s2, v8, v9 ; GFX6-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NEXT: 
buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; GFX6-NEXT: s_endpgm ; ; GFX89-LABEL: round_v8f32: ; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 -; GFX89-NEXT: s_brev_b32 s14, -2 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX89-NEXT: s_mov_b32 s3, 0xf000 -; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; GFX89-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24 +; GFX89-NEXT: s_brev_b32 s2, -2 +; GFX89-NEXT: s_mov_b32 s15, 0xf000 +; GFX89-NEXT: s_mov_b32 s14, -1 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_trunc_f32_e32 v0, s7 ; GFX89-NEXT: v_sub_f32_e32 v1, s7, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] ; GFX89-NEXT: v_mov_b32_e32 v2, s7 -; GFX89-NEXT: v_bfi_b32 v1, s14, v1, v2 +; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v2 ; GFX89-NEXT: v_add_f32_e32 v3, v0, v1 ; GFX89-NEXT: v_trunc_f32_e32 v0, s6 ; GFX89-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] ; GFX89-NEXT: v_mov_b32_e32 v2, s6 -; GFX89-NEXT: v_bfi_b32 v1, s14, v1, v2 +; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v2 ; GFX89-NEXT: v_add_f32_e32 v2, v0, v1 ; GFX89-NEXT: v_trunc_f32_e32 v0, s5 ; GFX89-NEXT: v_sub_f32_e32 v1, s5, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] ; GFX89-NEXT: v_mov_b32_e32 v4, s5 -; GFX89-NEXT: v_bfi_b32 v1, s14, v1, v4 +; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v4 ; GFX89-NEXT: v_add_f32_e32 v1, v0, v1 ; GFX89-NEXT: v_trunc_f32_e32 v0, s4 ; GFX89-NEXT: 
v_sub_f32_e32 v4, s4, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1] ; GFX89-NEXT: v_mov_b32_e32 v5, s4 -; GFX89-NEXT: v_bfi_b32 v4, s14, v4, v5 +; GFX89-NEXT: v_bfi_b32 v4, s2, v4, v5 ; GFX89-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX89-NEXT: v_trunc_f32_e32 v4, s11 ; GFX89-NEXT: v_sub_f32_e32 v5, s11, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] ; GFX89-NEXT: v_mov_b32_e32 v6, s11 -; GFX89-NEXT: v_bfi_b32 v5, s14, v5, v6 +; GFX89-NEXT: v_bfi_b32 v5, s2, v5, v6 ; GFX89-NEXT: v_add_f32_e32 v7, v4, v5 ; GFX89-NEXT: v_trunc_f32_e32 v4, s10 ; GFX89-NEXT: v_sub_f32_e32 v5, s10, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] ; GFX89-NEXT: v_mov_b32_e32 v6, s10 -; GFX89-NEXT: v_bfi_b32 v5, s14, v5, v6 +; GFX89-NEXT: v_bfi_b32 v5, s2, v5, v6 ; GFX89-NEXT: v_add_f32_e32 v6, v4, v5 ; GFX89-NEXT: v_trunc_f32_e32 v4, s9 ; GFX89-NEXT: v_sub_f32_e32 v5, s9, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] ; GFX89-NEXT: v_mov_b32_e32 v8, s9 -; GFX89-NEXT: v_bfi_b32 v5, s14, v5, v8 +; GFX89-NEXT: v_bfi_b32 v5, s2, v5, v8 ; GFX89-NEXT: v_add_f32_e32 v5, v4, v5 ; GFX89-NEXT: v_trunc_f32_e32 v4, s8 ; GFX89-NEXT: v_sub_f32_e32 v8, s8, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[4:5] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[0:1] ; GFX89-NEXT: 
v_mov_b32_e32 v9, s8 -; GFX89-NEXT: v_bfi_b32 v8, s14, v8, v9 +; GFX89-NEXT: v_bfi_b32 v8, s2, v8, v9 ; GFX89-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 +; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; GFX89-NEXT: s_endpgm ; ; GFX11-LABEL: round_v8f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x44 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_trunc_f32_e32 v0, s7 @@ -685,10 +628,10 @@ define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) # define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { ; GFX6-LABEL: round_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[2:3], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: v_trunc_f32_e32 v1, v0 ; GFX6-NEXT: v_sub_f32_e32 v2, v0, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, 0.5 @@ -699,62 +642,44 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX8-LABEL: round_f16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX8-NEXT: s_movk_i32 s5, 0x7fff -; GFX8-NEXT: s_mov_b32 s3, 
0xf000 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_trunc_f16_e32 v1, s4 -; GFX8-NEXT: v_sub_f16_e32 v2, s4, v1 -; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_bfi_b32 v0, s5, v0, v2 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_add_f16_e32 v0, v1, v0 -; GFX8-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: round_f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX9-NEXT: s_movk_i32 s0, 0x7fff -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_trunc_f16_e32 v1, s2 -; GFX9-NEXT: v_sub_f16_e32 v2, s2, v1 -; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v2 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_add_f16_e32 v0, v1, v0 -; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 -; GFX9-NEXT: s_endpgm +; GFX89-LABEL: round_f16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX89-NEXT: s_movk_i32 s5, 0x7fff +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: v_trunc_f16_e32 v1, s4 +; GFX89-NEXT: v_sub_f16_e32 v2, s4, v1 +; GFX89-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX89-NEXT: v_mov_b32_e32 v2, s4 +; GFX89-NEXT: v_bfi_b32 v0, s5, v0, v2 +; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: v_add_f16_e32 v0, v1, v0 +; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX89-NEXT: s_endpgm ; ; GFX11-LABEL: round_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c 
+; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_trunc_f16_e32 v0, s2 +; GFX11-NEXT: v_trunc_f16_e32 v0, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f16_e32 v1, s2, v0 -; GFX11-NEXT: v_cmp_ge_f16_e64 s3, |v1|, 0.5 +; GFX11-NEXT: v_sub_f16_e32 v1, s4, v0 +; GFX11-NEXT: v_cmp_ge_f16_e64 s2, |v1|, 0.5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x3c00, s3 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, v1, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x3c00, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, v1, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f16_e32 v0, v0, v1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -798,13 +723,13 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; GFX6-LABEL: round_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX6-NEXT: s_load_dword s0, s[2:3], 0xb ; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s3, s2, 16 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, s3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX6-NEXT: s_lshr_b32 s1, s0, 16 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, s1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: v_trunc_f32_e32 v3, v1 ; GFX6-NEXT: v_sub_f32_e32 v5, v1, v3 ; GFX6-NEXT: v_trunc_f32_e32 v2, v0 @@ -823,13 +748,14 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: round_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX8-NEXT: s_movk_i32 s6, 0x7fff ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -856,57 +782,57 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; ; GFX9-LABEL: round_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX9-NEXT: s_movk_i32 s1, 0x7fff -; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s2, 16 -; GFX9-NEXT: v_trunc_f16_e32 v1, s0 -; GFX9-NEXT: v_sub_f16_e32 v2, s0, v1 +; GFX9-NEXT: s_lshr_b32 s5, s4, 16 +; GFX9-NEXT: v_trunc_f16_e32 v1, s5 +; GFX9-NEXT: v_sub_f16_e32 v2, s5, v1 ; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: v_bfi_b32 v2, s1, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_bfi_b32 v2, s6, v2, v3 ; GFX9-NEXT: v_add_f16_e32 v1, v1, v2 -; GFX9-NEXT: v_trunc_f16_e32 v2, s2 -; GFX9-NEXT: v_sub_f16_e32 v3, s2, v2 +; GFX9-NEXT: v_trunc_f16_e32 v2, s4 +; GFX9-NEXT: v_sub_f16_e32 v3, s4, v2 ; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v3|, 0.5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_bfi_b32 v0, s1, v0, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_bfi_b32 v0, s6, v0, v3 ; GFX9-NEXT: v_add_f16_e32 v0, v2, v0 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 -; 
GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: round_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-NEXT: v_trunc_f16_e32 v1, s2 -; GFX11-NEXT: v_trunc_f16_e32 v0, s3 +; GFX11-NEXT: s_lshr_b32 s5, s4, 16 +; GFX11-NEXT: v_trunc_f16_e32 v1, s4 +; GFX11-NEXT: v_trunc_f16_e32 v0, s5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f16_e32 v3, s2, v1 -; GFX11-NEXT: v_sub_f16_e32 v2, s3, v0 +; GFX11-NEXT: v_sub_f16_e32 v3, s4, v1 +; GFX11-NEXT: v_sub_f16_e32 v2, s5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_ge_f16_e64 s4, |v2|, 0.5 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s4 +; GFX11-NEXT: v_cmp_ge_f16_e64 s2, |v2|, 0.5 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_ge_f16_e64 s4, |v3|, 0.5 -; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, v2, s3 +; GFX11-NEXT: v_cmp_ge_f16_e64 s2, |v3|, 0.5 +; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, v2, s5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0x3c00, s4 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, v3, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0x3c00, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, v3, s4 ; GFX11-NEXT: v_add_f16_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll index 6a9c4c8d41c20..70f15bd0aa613 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll @@ -64,7 +64,7 @@ define amdgpu_gfx void @s_set_rounding(i32 inreg %rounding) { define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) { ; GFX6-LABEL: s_set_rounding_kernel: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s2, s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s0, 0x1c84a50f ; GFX6-NEXT: s_mov_b32 s1, 0xb73e62d9 ; GFX6-NEXT: ;;#ASMSTART @@ -79,7 +79,7 @@ define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) { ; ; GFX7-LABEL: s_set_rounding_kernel: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s2, s[2:3], 0x9 ; GFX7-NEXT: s_mov_b32 s0, 0x1c84a50f ; GFX7-NEXT: s_mov_b32 s1, 0xb73e62d9 ; GFX7-NEXT: ;;#ASMSTART @@ -94,7 +94,7 @@ define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) { ; ; GFX8-LABEL: s_set_rounding_kernel: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s0, 0x1c84a50f ; GFX8-NEXT: s_mov_b32 s1, 0xb73e62d9 ; GFX8-NEXT: ;;#ASMSTART @@ -109,7 +109,7 @@ define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) { ; ; GFX9-LABEL: s_set_rounding_kernel: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s0, 0x1c84a50f ; GFX9-NEXT: s_mov_b32 s1, 0xb73e62d9 ; GFX9-NEXT: ;;#ASMSTART @@ -124,7 +124,7 @@ define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) { ; ; GFX10-LABEL: s_set_rounding_kernel: ; GFX10: ; 
%bb.0: -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -139,7 +139,7 @@ define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) { ; ; GFX11-LABEL: s_set_rounding_kernel: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll index 2ce0a628686ea..a70f4d8d90065 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-LABEL: sin_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -30,7 +30,7 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX8-LABEL: sin_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -46,7 +46,7 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX9-LABEL: sin_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -58,7 +58,7 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX10-LABEL: sin_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 
0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] @@ -70,7 +70,7 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX11-LABEL: sin_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -91,7 +91,7 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-LABEL: sin_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -121,7 +121,7 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX8-LABEL: sin_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -142,7 +142,7 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX9-LABEL: sin_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -158,7 +158,7 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX10-LABEL: sin_v2f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -174,7 +174,7 @@ define amdgpu_kernel void 
@sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX11-LABEL: sin_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll index f2d57ba902e73..c69ebedbec50b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll @@ -9,7 +9,7 @@ declare <2 x half> @llvm.sqrt.v2f16(<2 x half> %a) define amdgpu_kernel void @sqrt_f16( ; SI-LABEL: sqrt_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -29,7 +29,7 @@ define amdgpu_kernel void @sqrt_f16( ; ; VI-LABEL: sqrt_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -47,7 +47,7 @@ define amdgpu_kernel void @sqrt_f16( ; ; GFX11-LABEL: sqrt_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -83,7 +83,7 @@ entry: define amdgpu_kernel void @sqrt_v2f16( ; SI-LABEL: sqrt_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -109,7 +109,7 @@ define amdgpu_kernel void @sqrt_v2f16( ; ; VI-LABEL: sqrt_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 
s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -129,7 +129,7 @@ define amdgpu_kernel void @sqrt_v2f16( ; ; GFX11-LABEL: sqrt_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll index d1e2ddcdc6eac..11f5e6ebf9998 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll @@ -9,7 +9,7 @@ declare <2 x half> @llvm.trunc.v2f16(<2 x half> %a) define amdgpu_kernel void @trunc_f16( ; SI-LABEL: trunc_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -29,7 +29,7 @@ define amdgpu_kernel void @trunc_f16( ; ; VI-LABEL: trunc_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -47,7 +47,7 @@ define amdgpu_kernel void @trunc_f16( ; ; GFX11-LABEL: trunc_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -84,7 +84,7 @@ entry: define amdgpu_kernel void @trunc_v2f16( ; SI-LABEL: trunc_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -110,7 +110,7 @@ define amdgpu_kernel void @trunc_v2f16( ; ; VI-LABEL: trunc_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -130,7 +130,7 @@ define amdgpu_kernel void @trunc_v2f16( ; ; GFX11-LABEL: trunc_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll index 7c5ab1790c548..029c4e51e2993 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @constant_load_v8f32(ptr addrspace(4) noalias nocapture readonly %weights, ptr addrspace(1) noalias nocapture %out_ptr) { ; GFX6-LABEL: constant_load_v8f32: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s16, s[10:11], 0x0 ; GFX6-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 @@ -57,7 +57,7 @@ define amdgpu_kernel void @constant_load_v8f32(ptr addrspace(4) noalias nocaptur ; ; GFX12-LABEL: constant_load_v8f32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s12, s[10:11], 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[8:9], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll index cfaefca3a516d..7202ab8b31466 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_f64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -21,7 +21,7 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac ; ; GFX7-HSA-LABEL: constant_load_f64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -34,7 +34,7 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac ; ; GFX8-NOHSA-LABEL: constant_load_f64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -47,7 +47,7 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac ; ; GFX12-LABEL: constant_load_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -68,7 +68,7 @@ attributes #0 = { nounwind } define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocapture readonly %weights, ptr addrspace(1) noalias nocapture %out_ptr) { ; GFX6-NOHSA-LABEL: constant_load_2v4f64: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[24:25], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -92,7 +92,7 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr 
addrspace(4) noalias nocaptu ; ; GFX7-HSA-LABEL: constant_load_2v4f64: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -114,7 +114,7 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu ; ; GFX8-NOHSA-LABEL: constant_load_2v4f64: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0 ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -136,7 +136,7 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu ; ; GFX12-LABEL: constant_load_2v4f64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[20:21], s[18:19], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 04fba9ef6d86d..7178eaf2e7384 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: constant_load_i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -65,7 +65,7 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace ; ; GFX12-LABEL: constant_load_i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -84,7 +84,7 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_v2i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -101,7 +101,7 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: constant_load_v2i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -140,7 +140,7 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v2i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] @@ -157,7 +157,7 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_v3i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -174,7 +174,7 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: constant_load_v3i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -212,7 +212,7 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v3i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] @@ -229,7 +229,7 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_v4i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -246,7 +246,7 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: constant_load_v4i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -285,7 +285,7 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v4i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 
v1, v0, s[2:3] @@ -302,7 +302,7 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_v8i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -319,7 +319,7 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: constant_load_v8i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -358,7 +358,7 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v8i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] @@ -375,7 +375,7 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_v16i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -392,7 +392,7 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v16i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ 
-431,7 +431,7 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v16i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] @@ -448,7 +448,7 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_v32i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -460,7 +460,7 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v32i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -488,7 +488,7 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v32i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -505,7 +505,7 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_v64i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; 
GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -518,7 +518,7 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v64i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -547,7 +547,7 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v64i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -565,7 +565,7 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_i1_to_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -582,7 +582,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: constant_zextload_i1_to_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -611,7 +611,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_zextload_i1_to_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -629,7 +629,7 @@ define 
amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_i1_to_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -647,7 +647,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: constant_sextload_i1_to_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -678,7 +678,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_sextload_i1_to_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -698,7 +698,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v1i1_to_v1i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -715,7 +715,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v1i1_to_v1i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: 
v_mov_b32_e32 v1, s3 @@ -744,7 +744,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v1i1_to_v1i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -762,7 +762,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v1i1_to_v1i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -780,7 +780,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v1i1_to_v1i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -811,7 +811,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v1i1_to_v1i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -831,7 +831,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v2i1_to_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: 
s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -850,7 +850,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v2i1_to_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -884,7 +884,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v2i1_to_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v2, s[2:3] @@ -907,7 +907,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v2i1_to_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -926,7 +926,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v2i1_to_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -961,7 +961,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v2i1_to_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 
0x0 ; GFX12-NEXT: global_load_u8 v0, v2, s[2:3] @@ -983,7 +983,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v3i1_to_v3i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1004,7 +1004,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v3i1_to_v3i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1046,7 +1046,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v3i1_to_v3i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v3, s[2:3] @@ -1073,7 +1073,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v3i1_to_v3i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1094,7 +1094,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v3i1_to_v3i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1137,7 +1137,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v3i1_to_v3i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v3, s[2:3] @@ -1161,7 +1161,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v4i1_to_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1182,7 +1182,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v4i1_to_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1226,7 +1226,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v4i1_to_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v4, s[2:3] @@ -1257,7 +1257,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v4i1_to_v4i32: ; 
GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1278,7 +1278,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v4i1_to_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1323,7 +1323,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v4i1_to_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v4, s[2:3] @@ -1350,7 +1350,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v8i1_to_v8i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1376,7 +1376,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v8i1_to_v8i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1443,7 +1443,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v8i1_to_v8i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v8, s[2:3] @@ -1481,7 +1481,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v8i1_to_v8i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1507,7 +1507,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v8i1_to_v8i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1578,7 +1578,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v8i1_to_v8i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v8, s[2:3] @@ -1613,7 +1613,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v16i1_to_v16i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s10, s2 @@ -1649,7 +1649,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) 
%o ; ; GFX8-LABEL: constant_zextload_v16i1_to_v16i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1767,7 +1767,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v16i1_to_v16i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v16, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v16, s[2:3] @@ -1827,7 +1827,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v16i1_to_v16i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s10, s2 @@ -1863,7 +1863,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_sextload_v16i1_to_v16i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1990,7 +1990,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v16i1_to_v16i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v16, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v16, s[2:3] @@ -2043,7 +2043,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o 
define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v32i1_to_v32i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -2132,7 +2132,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_zextload_v32i1_to_v32i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2349,7 +2349,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v32i1_to_v32i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2441,7 +2441,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v32i1_to_v32i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -2530,7 +2530,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_sextload_v32i1_to_v32i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2770,7 +2770,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v32i1_to_v32i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2859,7 +2859,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v64i1_to_v64i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3025,7 +3025,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_zextload_v64i1_to_v64i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3444,7 +3444,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v64i1_to_v64i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -3614,7 +3614,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v64i1_to_v64i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3780,7 +3780,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_sextload_v64i1_to_v64i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -4241,7 +4241,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v64i1_to_v64i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -4400,7 +4400,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_i1_to_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4419,7 +4419,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: constant_zextload_i1_to_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -4451,7 +4451,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_zextload_i1_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; 
GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -4471,7 +4471,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_i1_to_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4490,7 +4490,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: constant_sextload_i1_to_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -4523,7 +4523,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_sextload_i1_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -4543,7 +4543,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v1i1_to_v1i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4562,7 +4562,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v1i1_to_v1i64: ; GFX8: ; %bb.0: 
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -4594,7 +4594,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v1i1_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -4614,7 +4614,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v1i1_to_v1i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4633,7 +4633,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v1i1_to_v1i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -4666,7 +4666,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v1i1_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -4686,7 +4686,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) 
nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v2i1_to_v2i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4707,7 +4707,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v2i1_to_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -4745,7 +4745,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v2i1_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] @@ -4768,7 +4768,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v2i1_to_v2i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4790,7 +4790,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v2i1_to_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -4829,7 +4829,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out ; ; GFX12-LABEL: 
constant_sextload_v2i1_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v4, s[2:3] @@ -4854,7 +4854,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v3i1_to_v3i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4878,7 +4878,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v3i1_to_v3i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, v5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -4931,7 +4931,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v3i1_to_v3i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v5, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v5, s[2:3] @@ -4960,7 +4960,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v3i1_to_v3i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4986,7 +4986,7 @@ define 
amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v3i1_to_v3i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -5041,7 +5041,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v3i1_to_v3i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v6, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v6, s[2:3] @@ -5072,7 +5072,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v4i1_to_v4i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -5098,7 +5098,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v4i1_to_v4i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -5158,7 +5158,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v4i1_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] @@ -5192,7 +5192,7 @@ define amdgpu_kernel void 
@constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v4i1_to_v4i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -5221,7 +5221,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v4i1_to_v4i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -5282,7 +5282,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v4i1_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v8, s[2:3] @@ -5317,7 +5317,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v8i1_to_v8i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s10, s2 @@ -5353,7 +5353,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v8i1_to_v8i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; 
GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -5453,7 +5453,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v8i1_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] @@ -5494,7 +5494,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v8i1_to_v8i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s10, s2 @@ -5537,7 +5537,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v8i1_to_v8i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -5642,7 +5642,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v8i1_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v16, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v16, s[2:3] @@ -5687,7 +5687,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v16i1_to_v16i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s10, s2 @@ -5746,7 +5746,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_zextload_v16i1_to_v16i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v6, v2 ; GFX8-NEXT: v_mov_b32_e32 v8, v2 @@ -5930,7 +5930,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v16i1_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] @@ -5997,7 +5997,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v16i1_to_v16i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s10, s2 @@ -6069,7 +6069,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_sextload_v16i1_to_v16i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -6263,7 +6263,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v16i1_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], 
s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v32, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v32, s[2:3] @@ -6336,7 +6336,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v32i1_to_v32i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -6443,7 +6443,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_zextload_v32i1_to_v32i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -6779,7 +6779,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v32i1_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6898,7 +6898,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v32i1_to_v32i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -7063,7 +7063,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; ; GFX8-LABEL: 
constant_sextload_v32i1_to_v32i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -7444,7 +7444,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v32i1_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -7574,7 +7574,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v64i1_to_v64i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7777,7 +7777,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_zextload_v64i1_to_v64i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -8428,7 +8428,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v64i1_to_v64i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -8645,7 +8645,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr 
addrspace(1) %o define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -8968,7 +8968,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0 ; GFX8-NEXT: s_mov_b32 s13, s7 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -9715,7 +9715,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_mov_b32 s19, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_mov_b32 s5, s19 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index a015a39a7184f..355c296d122ff 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: constant_load_i16: ; GCN-HSA: ; %bb.0: ; %entry -; 
GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -38,7 +38,7 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-NOHSA-VI-LABEL: constant_load_i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -77,7 +77,7 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac ; ; GFX12-LABEL: constant_load_i16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] @@ -95,7 +95,7 @@ entry: define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_v2i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dword s4, s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -107,7 +107,7 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-HSA-LABEL: constant_load_v2i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -119,7 +119,7 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-NOHSA-VI-LABEL: constant_load_v2i16: ; GCN-NOHSA-VI: ; 
%bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 @@ -147,7 +147,7 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v2i16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -165,7 +165,7 @@ entry: define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_v3i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -180,7 +180,7 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-HSA-LABEL: constant_load_v3i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 4 @@ -198,7 +198,7 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-NOHSA-VI-LABEL: constant_load_v3i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s0, 4 @@ -252,7 +252,7 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) 
%out, ptr addrsp ; ; GFX12-LABEL: constant_load_v3i16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -273,7 +273,7 @@ entry: define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_v4i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -286,7 +286,7 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-HSA-LABEL: constant_load_v4i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -299,7 +299,7 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-NOHSA-VI-LABEL: constant_load_v4i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 @@ -328,7 +328,7 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v4i16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -347,7 +347,7 @@ entry: define amdgpu_kernel void 
@constant_load_v8i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_v8i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -362,7 +362,7 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-HSA-LABEL: constant_load_v8i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -377,7 +377,7 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-NOHSA-VI-LABEL: constant_load_v8i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 @@ -408,7 +408,7 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v8i16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 @@ -428,7 +428,7 @@ entry: define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_v16i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: 
s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xf000 @@ -449,7 +449,7 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs ; ; GCN-HSA-LABEL: constant_load_v16i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-HSA-NEXT: s_add_u32 s10, s8, 16 @@ -473,7 +473,7 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs ; ; GCN-NOHSA-VI-LABEL: constant_load_v16i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s8, 16 @@ -522,7 +522,7 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: constant_load_v16i16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -546,7 +546,7 @@ entry: define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #0 { ; GCN-NOHSA-SI-LABEL: constant_load_v16i16_align2: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) @@ -590,7 +590,7 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; ; GCN-HSA-LABEL: constant_load_v16i16_align2: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx2 
s[0:1], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 @@ -608,7 +608,7 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; ; GCN-NOHSA-VI-LABEL: constant_load_v16i16_align2: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 14 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 @@ -742,7 +742,7 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; ; GFX12-LABEL: constant_load_v16i16_align2: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0xf @@ -778,7 +778,7 @@ entry: define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_i16_to_i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -795,7 +795,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p ; ; GCN-HSA-LABEL: constant_zextload_i16_to_i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -808,7 +808,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p ; ; GCN-NOHSA-VI-LABEL: constant_zextload_i16_to_i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; 
GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -837,7 +837,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p ; ; GFX12-LABEL: constant_zextload_i16_to_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u16 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -855,7 +855,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_i16_to_i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -872,7 +872,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p ; ; GCN-HSA-LABEL: constant_sextload_i16_to_i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -885,7 +885,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p ; ; GCN-NOHSA-VI-LABEL: constant_sextload_i16_to_i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -915,7 +915,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p ; ; GFX12-LABEL: 
constant_sextload_i16_to_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_i16 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -933,7 +933,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v1i16_to_v1i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -950,7 +950,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -963,7 +963,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v1i16_to_v1i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -992,7 +992,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v1i16_to_v1i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u16 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1010,7 +1010,7 @@ define amdgpu_kernel 
void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v1i16_to_v1i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1027,7 +1027,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1040,7 +1040,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v1i16_to_v1i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1070,7 +1070,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v1i16_to_v1i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_i16 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1088,7 +1088,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v2i16_to_v2i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1103,7 +1103,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1118,7 +1118,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v2i16_to_v2i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1152,7 +1152,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v2i16_to_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1175,7 +1175,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v2i16_to_v2i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1190,7 +1190,7 @@ 
define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1205,7 +1205,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v2i16_to_v2i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1240,7 +1240,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v2i16_to_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1262,7 +1262,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_zextload_v3i16_to_v3i32: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1281,7 +1281,7 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: 
s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1298,7 +1298,7 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v3i16_to_v3i32: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0 @@ -1338,7 +1338,7 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v3i16_to_v3i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1361,7 +1361,7 @@ entry: define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_sextload_v3i16_to_v3i32: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1380,7 +1380,7 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1397,7 +1397,7 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; ; 
GCN-NOHSA-VI-LABEL: constant_sextload_v3i16_to_v3i32: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0 @@ -1440,7 +1440,7 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v3i16_to_v3i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1465,7 +1465,7 @@ entry: define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v4i16_to_v4i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1484,7 +1484,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1503,7 +1503,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v4i16_to_v4i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], 
s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 @@ -1545,7 +1545,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v4i16_to_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1573,7 +1573,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v4i16_to_v4i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1592,7 +1592,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1611,7 +1611,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v4i16_to_v4i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 @@ -1655,7 +1655,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v4i16_to_v4i32: ; GFX12: ; %bb.0: -; 
GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1682,7 +1682,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v8i16_to_v8i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1711,7 +1711,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1743,7 +1743,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v8i16_to_v8i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1807,7 +1807,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v8i16_to_v8i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1842,7 +1842,7 @@ define 
amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v8i16_to_v8i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1871,7 +1871,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1903,7 +1903,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v8i16_to_v8i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1969,7 +1969,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v8i16_to_v8i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2001,7 +2001,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: 
constant_zextload_v16i16_to_v16i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -2050,7 +2050,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2108,7 +2108,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v16i16_to_v16i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2219,7 +2219,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) % ; ; GFX12-LABEL: constant_zextload_v16i16_to_v16i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2265,7 +2265,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v16i16_to_v16i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -2314,7 +2314,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2372,7 +2372,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v16i16_to_v16i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2487,7 +2487,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % ; ; GFX12-LABEL: constant_sextload_v16i16_to_v16i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2533,7 +2533,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v32i16_to_v32i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000 @@ -2622,7 +2622,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr 
addrspace(1) % ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2732,7 +2732,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v32i16_to_v32i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2937,7 +2937,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; ; GFX12-LABEL: constant_zextload_v32i16_to_v32i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -3013,7 +3013,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v32i16_to_v32i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000 @@ -3102,7 +3102,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], 
s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3212,7 +3212,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v32i16_to_v32i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3427,7 +3427,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; ; GFX12-LABEL: constant_sextload_v32i16_to_v32i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -3503,7 +3503,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 @@ -3672,7 +3672,7 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3888,7 +3888,7 @@ 
define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 @@ -4290,7 +4290,7 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GFX12-LABEL: constant_zextload_v64i16_to_v64i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x0 @@ -4426,7 +4426,7 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v64i16_to_v64i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x0 ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x10 @@ -4595,7 +4595,7 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_sextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4811,7 +4811,7 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v64i16_to_v64i32: ; 
GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x0 ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x40 @@ -5229,7 +5229,7 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GFX12-LABEL: constant_sextload_v64i16_to_v64i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x40 @@ -5365,7 +5365,7 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_i16_to_i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5383,7 +5383,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p ; ; GCN-HSA-LABEL: constant_zextload_i16_to_i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5397,7 +5397,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p ; ; GCN-NOHSA-VI-LABEL: constant_zextload_i16_to_i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v0, s2 @@ -5429,7 +5429,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p ; ; GFX12-LABEL: constant_zextload_i16_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] @@ -5453,7 +5453,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_i16_to_i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5471,7 +5471,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p ; ; GCN-HSA-LABEL: constant_sextload_i16_to_i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5485,7 +5485,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p ; ; GCN-NOHSA-VI-LABEL: constant_sextload_i16_to_i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -5519,7 +5519,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p ; ; GFX12-LABEL: constant_sextload_i16_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; 
GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v2, s[2:3] @@ -5540,7 +5540,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v1i16_to_v1i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5558,7 +5558,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5572,7 +5572,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v1i16_to_v1i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 @@ -5604,7 +5604,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v1i16_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] @@ -5623,7 +5623,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr 
addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v1i16_to_v1i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5641,7 +5641,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5655,7 +5655,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v1i16_to_v1i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -5689,7 +5689,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v1i16_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v2, s[2:3] @@ -5710,7 +5710,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v2i16_to_v2i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -5727,7 +5727,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5744,7 +5744,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v2i16_to_v2i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5782,7 +5782,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v2i16_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -5805,7 +5805,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v2i16_to_v2i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -5823,7 +5823,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i64: ; GCN-HSA: 
; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -5841,7 +5841,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v2i16_to_v2i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 @@ -5882,7 +5882,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v2i16_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -5905,7 +5905,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v4i16_to_v4i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -5928,7 +5928,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5954,7 +5954,7 @@ 
define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v4i16_to_v4i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6009,7 +6009,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v4i16_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6037,7 +6037,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v4i16_to_v4i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -6064,7 +6064,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6094,7 +6094,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v4i16_to_v4i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6156,7 +6156,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v4i16_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6187,7 +6187,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v8i16_to_v8i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -6222,7 +6222,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6266,7 +6266,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v8i16_to_v8i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6357,7 +6357,7 @@ define amdgpu_kernel void 
@constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v8i16_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6394,7 +6394,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v8i16_to_v8i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -6438,7 +6438,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6492,7 +6492,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v8i16_to_v8i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6598,7 +6598,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v8i16_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; 
GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6642,7 +6642,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xf000 @@ -6701,7 +6701,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6781,7 +6781,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6945,7 +6945,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_zextload_v16i16_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -7003,7 +7003,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % define amdgpu_kernel void 
@constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v16i16_to_v16i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -7081,7 +7081,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7187,7 +7187,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v16i16_to_v16i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7381,7 +7381,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_sextload_v16i16_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -7452,7 +7452,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 
s[16:19], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000 @@ -7559,7 +7559,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7711,7 +7711,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -8026,7 +8026,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_zextload_v32i16_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -8124,7 +8124,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v32i16_to_v32i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) 
@@ -8272,7 +8272,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -8476,7 +8476,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v32i16_to_v32i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -8854,7 +8854,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_sextload_v32i16_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index b0d8f72c22ba7..f1a6bccc559f0 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -10,7 +10,7 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -22,7 +22,7 @@ define amdgpu_kernel void @constant_load_i32(ptr 
addrspace(1) %out, ptr addrspac ; ; GFX7-HSA-LABEL: constant_load_i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -34,7 +34,7 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac ; ; GFX8-NOHSA-LABEL: constant_load_i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -62,7 +62,7 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-HSA-LABEL: constant_load_i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -73,7 +73,7 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac ; ; GFX12-LABEL: constant_load_i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -91,7 +91,7 @@ entry: define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v2i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -104,7 +104,7 @@ define amdgpu_kernel void 
@constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v2i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -117,7 +117,7 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-NOHSA-LABEL: constant_load_v2i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -146,7 +146,7 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-HSA-LABEL: constant_load_v2i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 @@ -158,7 +158,7 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v2i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -177,7 +177,7 @@ entry: define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v3i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 
@@ -193,7 +193,7 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v3i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -207,7 +207,7 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-NOHSA-LABEL: constant_load_v3i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 @@ -242,7 +242,7 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-HSA-LABEL: constant_load_v3i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -255,7 +255,7 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v3i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b96 s[4:6], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -274,7 +274,7 @@ entry: define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v4i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -289,7 +289,7 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v4i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -304,7 +304,7 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-NOHSA-LABEL: constant_load_v4i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -335,7 +335,7 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-HSA-LABEL: constant_load_v4i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -349,7 +349,7 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v4i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 @@ -369,7 +369,7 @@ entry: define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v8i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 
; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 @@ -390,7 +390,7 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v8i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s10, s8, 16 @@ -414,7 +414,7 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-NOHSA-LABEL: constant_load_v8i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_add_u32 s10, s8, 16 @@ -458,7 +458,7 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-HSA-LABEL: constant_load_v8i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -477,7 +477,7 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v8i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -501,7 +501,7 @@ entry: define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v9i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: 
s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s12, s[10:11], 0x8 ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -526,7 +526,7 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v9i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s12, s[10:11], 0x8 ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -557,7 +557,7 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-NOHSA-LABEL: constant_load_v9i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s12, s[10:11], 0x20 ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -614,7 +614,7 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-HSA-LABEL: constant_load_v9i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dword s12, s[10:11], 0x20 @@ -636,7 +636,7 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v9i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s12, s[10:11], 0x20 @@ -663,7 +663,7 @@ entry: define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v10i32: ; GFX6-NOHSA: ; 
%bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[12:13], s[10:11], 0x8 ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -689,7 +689,7 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v10i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[12:13], s[10:11], 0x8 ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -721,7 +721,7 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-NOHSA-LABEL: constant_load_v10i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[12:13], s[10:11], 0x20 ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -780,7 +780,7 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-HSA-LABEL: constant_load_v10i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx2 s[12:13], s[10:11], 0x20 @@ -803,7 +803,7 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: constant_load_v10i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[12:13], s[10:11], 0x20 @@ -831,7 +831,7 @@ entry: define amdgpu_kernel void 
@constant_load_v11i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v11i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8 ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -860,7 +860,7 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v11i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x8 ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 @@ -893,7 +893,7 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-NOHSA-LABEL: constant_load_v11i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x20 ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 @@ -958,7 +958,7 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-HSA-LABEL: constant_load_v11i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x20 @@ -982,7 +982,7 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: constant_load_v11i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: 
s_clause 0x1 ; GFX12-NEXT: s_load_b96 s[12:14], s[10:11], 0x20 @@ -1010,7 +1010,7 @@ entry: define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v12i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8 ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -1038,7 +1038,7 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v12i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8 ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -1072,7 +1072,7 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-NOHSA-LABEL: constant_load_v12i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x20 ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -1133,7 +1133,7 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-HSA-LABEL: constant_load_v12i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x20 @@ -1158,7 +1158,7 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: constant_load_v12i32: ; GFX12: ; %bb.0: ; %entry -; 
GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[12:15], s[10:11], 0x20 @@ -1187,7 +1187,7 @@ entry: define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v16i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s19, 0xf000 @@ -1220,7 +1220,7 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v16i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s18, s16, 48 @@ -1262,7 +1262,7 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-NOHSA-LABEL: constant_load_v16i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX8-NOHSA-NEXT: s_add_u32 s18, s16, 48 @@ -1335,7 +1335,7 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-HSA-LABEL: constant_load_v16i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 @@ -1365,7 +1365,7 @@ define amdgpu_kernel void 
@constant_load_v16i32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: constant_load_v16i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1395,7 +1395,7 @@ entry: define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_i32_to_i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1408,7 +1408,7 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX7-HSA-LABEL: constant_zextload_i32_to_i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1421,7 +1421,7 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX8-NOHSA-LABEL: constant_zextload_i32_to_i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1451,7 +1451,7 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX9-HSA-LABEL: constant_zextload_i32_to_i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ 
-1462,7 +1462,7 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX12-LABEL: constant_zextload_i32_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1480,7 +1480,7 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_i32_to_i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1494,7 +1494,7 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX7-HSA-LABEL: constant_sextload_i32_to_i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1508,7 +1508,7 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX8-NOHSA-LABEL: constant_sextload_i32_to_i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1539,7 +1539,7 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX9-HSA-LABEL: constant_sextload_i32_to_i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; 
GFX9-HSA-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1552,7 +1552,7 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX12-LABEL: constant_sextload_i32_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1573,7 +1573,7 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v1i32_to_v1i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1586,7 +1586,7 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_zextload_v1i32_to_v1i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1599,7 +1599,7 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_zextload_v1i32_to_v1i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1629,7 +1629,7 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_zextload_v1i32_to_v1i64: ; 
GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1640,7 +1640,7 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v1i32_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1658,7 +1658,7 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v1i32_to_v1i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1672,7 +1672,7 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_sextload_v1i32_to_v1i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1686,7 +1686,7 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_sextload_v1i32_to_v1i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1717,7 +1717,7 @@ 
define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_sextload_v1i32_to_v1i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1730,7 +1730,7 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v1i32_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1751,7 +1751,7 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v2i32_to_v2i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1766,7 +1766,7 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_zextload_v2i32_to_v2i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1781,7 +1781,7 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_zextload_v2i32_to_v2i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1816,7 +1816,7 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_zextload_v2i32_to_v2i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1829,7 +1829,7 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v2i32_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1849,7 +1849,7 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v2i32_to_v2i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1866,7 +1866,7 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_sextload_v2i32_to_v2i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1884,7 +1884,7 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_sextload_v2i32_to_v2i64: ; GFX8-NOHSA: ; %bb.0: 
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1924,7 +1924,7 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_sextload_v2i32_to_v2i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 @@ -1940,7 +1940,7 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v2i32_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1963,7 +1963,7 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v4i32_to_v4i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1982,7 +1982,7 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_zextload_v4i32_to_v4i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2004,7 +2004,7 @@ define 
amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_zextload_v4i32_to_v4i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2053,7 +2053,7 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_zextload_v4i32_to_v4i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2070,7 +2070,7 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v4i32_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2093,7 +2093,7 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v4i32_to_v4i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -2117,7 +2117,7 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_sextload_v4i32_to_v4i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2145,7 +2145,7 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_sextload_v4i32_to_v4i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2203,7 +2203,7 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_sextload_v4i32_to_v4i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -2226,7 +2226,7 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v4i32_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2254,7 +2254,7 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 @@ -2281,7 +2281,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: 
constant_zextload_v8i32_to_v8i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2317,7 +2317,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2399,7 +2399,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2424,7 +2424,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2453,7 +2453,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v8i32_to_v8i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 
0xf000 @@ -2491,7 +2491,7 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_sextload_v8i32_to_v8i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2543,7 +2543,7 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_sextload_v8i32_to_v8i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2644,7 +2644,7 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_sextload_v8i32_to_v8i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 @@ -2683,7 +2683,7 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v8i32_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2721,7 +2721,7 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v16i32_to_v16i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 +; 
GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s19, 0xf000 @@ -2788,7 +2788,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX7-HSA-LABEL: constant_sextload_v16i32_to_v16i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2888,7 +2888,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX8-NOHSA-LABEL: constant_sextload_v16i32_to_v16i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3075,7 +3075,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX9-HSA-LABEL: constant_sextload_v16i32_to_v16i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 @@ -3142,7 +3142,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_sextload_v16i32_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -3201,7 +3201,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % define amdgpu_kernel void 
@constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v16i32_to_v16i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s19, 0xf000 @@ -3244,7 +3244,7 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX7-HSA-LABEL: constant_zextload_v16i32_to_v16i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3314,7 +3314,7 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX8-NOHSA-LABEL: constant_zextload_v16i32_to_v16i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3462,7 +3462,7 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX9-HSA-LABEL: constant_zextload_v16i32_to_v16i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3503,7 +3503,7 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_zextload_v16i32_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; 
GFX12-NEXT: s_wait_kmcnt 0x0 @@ -3544,7 +3544,7 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s39, 0xf000 @@ -3680,7 +3680,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX7-HSA-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3870,7 +3870,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX8-NOHSA-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 @@ -4231,7 +4231,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX9-HSA-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 @@ -4355,7 +4355,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX12-LABEL: 
constant_sextload_v32i32_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x0 @@ -4459,7 +4459,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v32i32_to_v32i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x10 ; GFX6-NOHSA-NEXT: s_mov_b32 s39, 0xf000 @@ -4537,7 +4537,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX7-HSA-LABEL: constant_zextload_v32i32_to_v32i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4672,7 +4672,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX8-NOHSA-LABEL: constant_zextload_v32i32_to_v32i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4957,7 +4957,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX9-HSA-LABEL: constant_zextload_v32i32_to_v32i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, v1 ; 
GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5031,7 +5031,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_zextload_v32i32_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x40 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 @@ -5098,7 +5098,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v32i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x10 ; GFX6-NOHSA-NEXT: s_mov_b32 s39, 0xf000 @@ -5158,7 +5158,7 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v32i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 @@ -5241,7 +5241,7 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-NOHSA-LABEL: constant_load_v32i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 @@ -5375,7 +5375,7 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-HSA-LABEL: constant_load_v32i32: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: 
s_load_dwordx4 s[36:39], s[4:5], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 @@ -5426,7 +5426,7 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: constant_load_v32i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x40 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll index 66c73fda38743..46c7c2b08cd64 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-LABEL: constant_load_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -21,7 +21,7 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac ; ; GFX7-LABEL: constant_load_i64: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -34,7 +34,7 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac ; ; GFX8-LABEL: constant_load_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -63,7 +63,7 @@ define amdgpu_kernel void 
@constant_load_i64(ptr addrspace(1) %out, ptr addrspac ; ; GFX12-LABEL: constant_load_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -81,7 +81,7 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-LABEL: constant_load_v2i64: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -96,7 +96,7 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-LABEL: constant_load_v2i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 @@ -111,7 +111,7 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v2i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 @@ -142,7 +142,7 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v2i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 @@ -162,7 +162,7 @@ entry: define amdgpu_kernel void @constant_load_v3i64(ptr 
addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-LABEL: constant_load_v3i64: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x4 ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -182,7 +182,7 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-LABEL: constant_load_v3i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x4 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -205,7 +205,7 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v3i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x10 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -253,7 +253,7 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v3i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[8:9], s[2:3], 0x10 @@ -278,7 +278,7 @@ entry: define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-LABEL: constant_load_v4i64: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NEXT: s_mov_b32 s11, 0xf000 @@ -299,7 +299,7 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp ; ; 
GFX7-LABEL: constant_load_v4i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-NEXT: s_add_u32 s10, s8, 16 @@ -323,7 +323,7 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v4i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NEXT: s_add_u32 s10, s8, 16 @@ -372,7 +372,7 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v4i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -396,7 +396,7 @@ entry: define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-LABEL: constant_load_v8i64: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NEXT: s_mov_b32 s19, 0xf000 @@ -429,7 +429,7 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-LABEL: constant_load_v8i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-NEXT: s_add_u32 s18, s16, 48 @@ -471,7 +471,7 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v8i64: ; GFX8: ; %bb.0: ; %entry -; 
GFX8-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX8-NEXT: s_add_u32 s18, s16, 48 @@ -558,7 +558,7 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v8i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -588,7 +588,7 @@ entry: define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-LABEL: constant_load_v16i64: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x10 ; GFX6-NEXT: s_mov_b32 s39, 0xf000 @@ -648,7 +648,7 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; ; GFX7-LABEL: constant_load_v16i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 ; GFX7-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 @@ -731,7 +731,7 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: constant_load_v16i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 ; GFX8-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 @@ -899,7 +899,7 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: constant_load_v16i64: ; GFX12: ; %bb.0: ; %entry -; 
GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x40 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index 9000cee7ef9df..67a376b8c0f3c 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_i8: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -26,7 +26,7 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace ; ; GFX7-HSA-LABEL: constant_load_i8: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -39,7 +39,7 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-NOHSA-LABEL: constant_load_i8: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -78,7 +78,7 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace ; ; GFX12-LABEL: constant_load_i8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 
v1, v0, s[2:3] @@ -96,7 +96,7 @@ entry: define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v2i8: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -113,7 +113,7 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX7-HSA-LABEL: constant_load_v2i8: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -126,7 +126,7 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-NOHSA-LABEL: constant_load_v2i8: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -165,7 +165,7 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v2i8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] @@ -183,7 +183,7 @@ entry: define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v3i8: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; 
GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -198,7 +198,7 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX7-HSA-LABEL: constant_load_v3i8: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -217,7 +217,7 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-NOHSA-LABEL: constant_load_v3i8: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -278,7 +278,7 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v3i8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -298,7 +298,7 @@ entry: define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v4i8: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -310,7 +310,7 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX7-HSA-LABEL: constant_load_v4i8: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, 
s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -322,7 +322,7 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-NOHSA-LABEL: constant_load_v4i8: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -350,7 +350,7 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v4i8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -368,7 +368,7 @@ entry: define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v8i8: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -381,7 +381,7 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX7-HSA-LABEL: constant_load_v8i8: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -394,7 +394,7 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-NOHSA-LABEL: constant_load_v8i8: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 
; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -423,7 +423,7 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v8i8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -442,7 +442,7 @@ entry: define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v16i8: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -457,7 +457,7 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v16i8: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -472,7 +472,7 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-NOHSA-LABEL: constant_load_v16i8: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -503,7 +503,7 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v16i8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; 
GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 @@ -523,7 +523,7 @@ entry: define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_i8_to_i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -540,7 +540,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt ; ; GFX7-HSA-LABEL: constant_zextload_i8_to_i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -553,7 +553,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt ; ; GFX8-NOHSA-LABEL: constant_zextload_i8_to_i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -582,7 +582,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_zextload_i8_to_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -600,7 +600,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_i8_to_i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -617,7 +617,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt ; ; GFX7-HSA-LABEL: constant_sextload_i8_to_i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -630,7 +630,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt ; ; GFX8-NOHSA-LABEL: constant_sextload_i8_to_i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -660,7 +660,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_sextload_i8_to_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_i8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -678,7 +678,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v1i8_to_v1i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -695,7 +695,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: 
constant_zextload_v1i8_to_v1i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -708,7 +708,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v1i8_to_v1i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -737,7 +737,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v1i8_to_v1i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -755,7 +755,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v1i8_to_v1i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -772,7 +772,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -785,7 +785,7 @@ define amdgpu_kernel 
void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v1i8_to_v1i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -815,7 +815,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v1i8_to_v1i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_i8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -834,7 +834,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v2i8_to_v2i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -853,7 +853,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -868,7 +868,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v2i8_to_v2i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -911,7 +911,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v2i8_to_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v2, s[2:3] @@ -934,7 +934,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v2i8_to_v2i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -953,7 +953,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -968,7 +968,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v2i8_to_v2i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1011,7 +1011,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v2i8_to_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 
s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v2, s[2:3] @@ -1034,7 +1034,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v3i8_to_v3i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1053,7 +1053,7 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v3i8_to_v3i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1070,7 +1070,7 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v3i8_to_v3i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1112,7 +1112,7 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v3i8_to_v3i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1137,7 +1137,7 @@ entry: define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out, 
ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v3i8_to_v3i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1156,7 +1156,7 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v3i8_to_v3i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1173,7 +1173,7 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v3i8_to_v3i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1216,7 +1216,7 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v3i8_to_v3i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1242,7 +1242,7 @@ entry: define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v4i8_to_v4i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; 
GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1261,7 +1261,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1280,7 +1280,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v4i8_to_v4i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1322,7 +1322,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v4i8_to_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1348,7 +1348,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v4i8_to_v4i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1367,7 +1367,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; 
GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1386,7 +1386,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v4i8_to_v4i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1431,7 +1431,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v4i8_to_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1457,7 +1457,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v8i8_to_v8i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1486,7 +1486,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1518,7 +1518,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr 
addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v8i8_to_v8i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1 @@ -1583,7 +1583,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v8i8_to_v8i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1616,7 +1616,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v8i8_to_v8i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1645,7 +1645,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1677,7 +1677,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v8i8_to_v8i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1 @@ -1747,7 +1747,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v8i8_to_v8i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1782,7 +1782,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v16i8_to_v16i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1831,7 +1831,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1889,7 +1889,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_zextload_v16i8_to_v16i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1999,7 +1999,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v16i8_to_v16i32: 
; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2046,7 +2046,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v16i8_to_v16i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -2095,7 +2095,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2153,7 +2153,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v16i8_to_v16i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2275,7 +2275,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v16i8_to_v16i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2326,7 +2326,7 @@ 
define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v32i8_to_v32i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -2415,7 +2415,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2525,7 +2525,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_zextload_v32i8_to_v32i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2724,7 +2724,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v32i8_to_v32i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2802,7 +2802,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: 
constant_sextload_v32i8_to_v32i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -2891,7 +2891,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3001,7 +3001,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v32i8_to_v32i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3231,7 +3231,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v32i8_to_v32i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -3315,7 +3315,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v64i8_to_v64i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: 
s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3483,7 +3483,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v64i8_to_v64i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3699,7 +3699,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_zextload_v64i8_to_v64i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4086,7 +4086,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v64i8_to_v64i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -4222,7 +4222,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v64i8_to_v64i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4390,7 +4390,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: 
constant_sextload_v64i8_to_v64i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4604,7 +4604,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v64i8_to_v64i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5047,7 +5047,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v64i8_to_v64i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -5195,7 +5195,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_i8_to_i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -5213,7 +5213,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt ; ; GFX7-HSA-LABEL: constant_zextload_i8_to_i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: 
v_mov_b32_e32 v1, s3 @@ -5227,7 +5227,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt ; ; GFX8-NOHSA-LABEL: constant_zextload_i8_to_i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 @@ -5259,7 +5259,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_zextload_i8_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] @@ -5279,7 +5279,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_i8_to_i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -5297,7 +5297,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt ; ; GFX7-HSA-LABEL: constant_sextload_i8_to_i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5311,7 +5311,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt ; ; GFX8-NOHSA-LABEL: constant_sextload_i8_to_i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; 
GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5345,7 +5345,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_sextload_i8_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_i8 v0, v2, s[2:3] @@ -5366,7 +5366,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v1i8_to_v1i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -5384,7 +5384,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5398,7 +5398,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v1i8_to_v1i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5429,7 +5429,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v1i8_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -5448,7 +5448,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v1i8_to_v1i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -5466,7 +5466,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5480,7 +5480,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v1i8_to_v1i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5514,7 +5514,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v1i8_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_i8 v0, v2, s[2:3] @@ -5535,7 +5535,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out define 
amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v2i8_to_v2i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -5556,7 +5556,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5573,7 +5573,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v2i8_to_v2i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5620,7 +5620,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v2i8_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] @@ -5642,7 +5642,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v2i8_to_v2i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; 
GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -5664,7 +5664,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5682,7 +5682,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v2i8_to_v2i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5731,7 +5731,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v2i8_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v4, s[2:3] @@ -5756,7 +5756,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v4i8_to_v4i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -5779,7 +5779,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i64: ; GFX7-HSA: ; 
%bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5805,7 +5805,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v4i8_to_v4i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5863,7 +5863,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v4i8_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -5892,7 +5892,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v4i8_to_v4i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -5920,7 +5920,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5951,7 +5951,7 @@ define amdgpu_kernel void 
@constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v4i8_to_v4i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6014,7 +6014,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v4i8_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6046,7 +6046,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v8i8_to_v8i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -6081,7 +6081,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6125,7 +6125,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v8i8_to_v8i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 
v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6220,7 +6220,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v8i8_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6260,7 +6260,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -6305,7 +6305,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6361,7 +6361,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_mov_b32 s5, 0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 @@ -6472,7 +6472,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX12: ; %bb.0: -; 
GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -6518,7 +6518,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v16i8_to_v16i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -6577,7 +6577,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6657,7 +6657,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_zextload_v16i8_to_v16i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6826,7 +6826,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v16i8_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6887,7 +6887,7 @@ define amdgpu_kernel void 
@constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v16i8_to_v16i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -6967,7 +6967,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7074,7 +7074,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v16i8_to_v16i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7275,7 +7275,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v16i8_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -7347,7 +7347,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v32i8_to_v32i64: ; GFX6-NOHSA: ; %bb.0: 
-; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 @@ -7454,7 +7454,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7606,7 +7606,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_zextload_v32i8_to_v32i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7934,7 +7934,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v32i8_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -8039,7 +8039,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -8198,7 
+8198,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -8406,7 +8406,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -8793,7 +8793,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -8932,7 +8932,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_i8_to_i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -8949,7 +8949,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt ; ; GFX7-HSA-LABEL: constant_zextload_i8_to_i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 
0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -8962,7 +8962,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt ; ; GFX8-NOHSA-LABEL: constant_zextload_i8_to_i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9000,7 +9000,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_zextload_i8_to_i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] @@ -9018,7 +9018,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_i8_to_i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -9035,7 +9035,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt ; ; GFX7-HSA-LABEL: constant_sextload_i8_to_i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9048,7 +9048,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt ; ; GFX8-NOHSA-LABEL: constant_sextload_i8_to_i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9088,7 +9088,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_sextload_i8_to_i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_i8 v1, v0, s[2:3] @@ -9106,7 +9106,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v1i8_to_v1i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -9123,7 +9123,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9136,7 +9136,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v1i8_to_v1i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9174,7 +9174,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr 
addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v1i8_to_v1i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] @@ -9192,7 +9192,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v1i8_to_v1i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -9209,7 +9209,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9222,7 +9222,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v1i8_to_v1i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9262,7 +9262,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v1i8_to_v1i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_i8 v1, v0, s[2:3] @@ -9280,7 
+9280,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v2i8_to_v2i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -9300,7 +9300,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9316,7 +9316,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v2i8_to_v2i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 @@ -9356,7 +9356,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v2i8_to_v2i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] @@ -9379,7 +9379,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v2i8_to_v2i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -9401,7 +9401,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9419,7 +9419,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v2i8_to_v2i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 @@ -9469,7 +9469,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v2i8_to_v2i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] @@ -9492,7 +9492,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v4i8_to_v4i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -9512,7 +9512,7 @@ define amdgpu_kernel void 
@constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -9532,7 +9532,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v4i8_to_v4i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -9598,7 +9598,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v4i8_to_v4i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -9627,7 +9627,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v4i8_to_v4i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -9650,7 +9650,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -9673,7 +9673,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v4i8_to_v4i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -9749,7 +9749,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v4i8_to_v4i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -9777,7 +9777,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v8i8_to_v8i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -9806,7 +9806,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -9835,7 +9835,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v8i8_to_v8i16: ; GFX8-NOHSA: ; %bb.0: -; 
GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -9940,7 +9940,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v8i8_to_v8i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -9976,7 +9976,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v8i8_to_v8i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -10011,7 +10011,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -10046,7 +10046,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v8i8_to_v8i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ 
-10171,7 +10171,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v8i8_to_v8i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -10206,7 +10206,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v16i8_to_v16i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -10254,7 +10254,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -10306,7 +10306,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_zextload_v16i8_to_v16i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -10501,7 +10501,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v16i8_to_v16i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: 
s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -10556,7 +10556,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v16i8_to_v16i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -10617,7 +10617,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -10681,7 +10681,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v16i8_to_v16i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -10912,7 +10912,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v16i8_to_v16i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -10968,7 +10968,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr 
addrspace(1) %o define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v32i8_to_v32i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 @@ -11054,7 +11054,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -11152,7 +11152,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_zextload_v32i8_to_v32i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -11523,7 +11523,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v32i8_to_v32i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -11616,7 +11616,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v32i8_to_v32i16: ; GFX6-NOHSA: ; %bb.0: -; 
GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 @@ -11729,7 +11729,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -11851,7 +11851,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v32i8_to_v32i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -12297,7 +12297,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v32i8_to_v32i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 21e27bfa75531..142bc37fdeb75 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -10,7 +10,7 @@ define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_load_i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -27,7 +27,7 @@ define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace( ; ; GCN-HSA-LABEL: global_load_i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -40,7 +40,7 @@ define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace( ; ; GCN-NOHSA-VI-LABEL: global_load_i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -115,7 +115,7 @@ entry: define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_load_v2i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -132,7 +132,7 @@ define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v2i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -145,7 +145,7 @@ define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-NOHSA-VI-LABEL: global_load_v2i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; 
GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -200,7 +200,7 @@ entry: define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_load_v3i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -218,7 +218,7 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v3i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -236,7 +236,7 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-NOHSA-VI-LABEL: global_load_v3i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -332,7 +332,7 @@ entry: define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_load_v4i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -349,7 +349,7 @@ define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v4i16: ; GCN-HSA: ; %bb.0: ; %entry -; 
GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -362,7 +362,7 @@ define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-NOHSA-VI-LABEL: global_load_v4i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -417,7 +417,7 @@ entry: define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_load_v8i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -434,7 +434,7 @@ define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v8i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -447,7 +447,7 @@ define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-NOHSA-VI-LABEL: global_load_v8i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -502,7 +502,7 @@ entry: define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: 
global_load_v16i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -522,7 +522,7 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa ; ; GCN-HSA-LABEL: global_load_v16i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -546,7 +546,7 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa ; ; GCN-NOHSA-VI-LABEL: global_load_v16i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -622,7 +622,7 @@ entry: define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { ; GCN-NOHSA-SI-LABEL: global_load_v16i16_align2: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, s10 @@ -672,7 +672,7 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a ; ; GCN-HSA-LABEL: global_load_v16i16_align2: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -696,7 +696,7 @@ define amdgpu_kernel void 
@global_load_v16i16_align2(ptr addrspace(1) %in, ptr a ; ; GCN-NOHSA-VI-LABEL: global_load_v16i16_align2: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -804,7 +804,7 @@ entry: define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_i16_to_i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -821,7 +821,7 @@ define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr ; ; GCN-HSA-LABEL: global_zextload_i16_to_i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -834,7 +834,7 @@ define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr ; ; GCN-NOHSA-VI-LABEL: global_zextload_i16_to_i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -889,7 +889,7 @@ define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_i16_to_i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -906,7 +906,7 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr ; ; GCN-HSA-LABEL: global_sextload_i16_to_i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -919,7 +919,7 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr ; ; GCN-NOHSA-VI-LABEL: global_sextload_i16_to_i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -977,7 +977,7 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v1i16_to_v1i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -994,7 +994,7 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1007,7 +1007,7 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v1i16_to_v1i32: ; 
GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1062,7 +1062,7 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v1i16_to_v1i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1079,7 +1079,7 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1092,7 +1092,7 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v1i16_to_v1i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1150,7 +1150,7 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v2i16_to_v2i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; 
GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1169,7 +1169,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1184,7 +1184,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v2i16_to_v2i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1249,7 +1249,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v2i16_to_v2i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1268,7 +1268,7 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1283,7 +1283,7 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v2i16_to_v2i32: ; 
GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1348,7 +1348,7 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_zextload_v3i16_to_v3i32: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1369,7 +1369,7 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1385,7 +1385,7 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v3i16_to_v3i32: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1458,7 +1458,7 @@ entry: define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_sextload_v3i16_to_v3i32: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; 
GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1479,7 +1479,7 @@ define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1495,7 +1495,7 @@ define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v3i16_to_v3i32: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1575,7 +1575,7 @@ entry: define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v4i16_to_v4i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1596,7 +1596,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1613,7 +1613,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v4i16_to_v4i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; 
GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1689,7 +1689,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v4i16_to_v4i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1711,7 +1711,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1729,7 +1729,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1807,7 +1807,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v8i16_to_v8i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; 
GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1833,7 +1833,7 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1859,7 +1859,7 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v8i16_to_v8i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1956,7 +1956,7 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v8i16_to_v8i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1982,7 +1982,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2008,7 +2008,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v8i16_to_v8i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -2108,7 +2108,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v16i16_to_v16i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -2146,7 +2146,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -2196,7 +2196,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -2344,7 +2344,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v16i16_to_v16i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 
s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -2382,7 +2382,7 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2432,7 +2432,7 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_sextload_v16i16_to_v16i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -2591,7 +2591,7 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v32i16_to_v32i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -2653,7 +2653,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -2751,7 +2751,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -3002,7 +3002,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v32i16_to_v32i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -3064,7 +3064,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -3162,7 +3162,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -3450,9 +3450,9 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s3 +; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s9 ; GCN-NOHSA-SI-NEXT: s_addc_u32 s13, s13, 0 -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -3579,7 +3579,7 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -3772,14 +3772,14 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_zextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s90, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s91, 0xe80000 -; GCN-NOHSA-VI-NEXT: s_add_u32 s88, s88, s3 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_add_u32 s88, s88, s9 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 @@ -4260,13 +4260,13 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v64i16_to_v64i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xe8f000 -; GCN-NOHSA-SI-NEXT: s_add_u32 s8, s8, s3 -; GCN-NOHSA-SI-NEXT: s_addc_u32 s9, s9, 0 -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: 
s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s9 +; GCN-NOHSA-SI-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4289,11 +4289,11 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v10 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v11, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v10, 0, 16 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v9 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 16, v8 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v9, 0, 16 @@ -4370,17 +4370,17 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; 4-byte 
Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -4573,14 +4573,14 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_sextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s90, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s91, 0xe80000 -; GCN-NOHSA-VI-NEXT: s_add_u32 s88, s88, s3 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_add_u32 s88, s88, s9 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 @@ -5126,7 +5126,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou define 
amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_i16_to_i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5144,7 +5144,7 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr ; ; GCN-HSA-LABEL: global_zextload_i16_to_i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5158,7 +5158,7 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr ; ; GCN-NOHSA-VI-LABEL: global_zextload_i16_to_i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5223,7 +5223,7 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_i16_to_i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5241,7 +5241,7 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr ; ; GCN-HSA-LABEL: global_sextload_i16_to_i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; 
GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5255,7 +5255,7 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr ; ; GCN-NOHSA-VI-LABEL: global_sextload_i16_to_i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5318,7 +5318,7 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v1i16_to_v1i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5336,7 +5336,7 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5350,7 +5350,7 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v1i16_to_v1i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5410,7 +5410,7 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out, define amdgpu_kernel void 
@global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v1i16_to_v1i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5428,7 +5428,7 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5442,7 +5442,7 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v1i16_to_v1i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5505,7 +5505,7 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v2i16_to_v2i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5526,7 +5526,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 
0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5543,7 +5543,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v2i16_to_v2i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5613,7 +5613,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v2i16_to_v2i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5635,7 +5635,7 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5653,7 +5653,7 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v2i16_to_v2i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5727,7 +5727,7 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void 
@global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v4i16_to_v4i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5753,7 +5753,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5779,7 +5779,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v4i16_to_v4i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5871,7 +5871,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v4i16_to_v4i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5898,7 +5898,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 
0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5925,7 +5925,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -6022,7 +6022,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -6058,7 +6058,7 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4 @@ -6102,7 +6102,7 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -6240,7 +6240,7 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, define amdgpu_kernel void 
@global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v8i16_to_v8i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -6277,7 +6277,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6322,7 +6322,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v8i16_to_v8i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -6469,7 +6469,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v16i16_to_v16i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -6527,7 +6527,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], 
s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8 @@ -6613,7 +6613,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -6847,7 +6847,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v16i16_to_v16i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -6907,7 +6907,7 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6995,7 +6995,7 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_sextload_v16i16_to_v16i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -7247,9 +7247,9 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; 
GCN-NOHSA-SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s3 +; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s9 ; GCN-NOHSA-SI-NEXT: s_addc_u32 s13, s13, 0 -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v39, 0 @@ -7374,7 +7374,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -7529,7 +7529,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -7965,7 +7965,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v32i16_to_v32i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -8076,7 +8076,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: 
global_sextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -8250,7 +8250,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index 0f9cc33d731f1..c0649322c8195 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -26,7 +26,7 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace( ; ; GCNX3-HSA-LABEL: global_load_i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -39,7 +39,7 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace( ; ; GCNX3-NOHSA-LABEL: global_load_i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 
0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -72,7 +72,7 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace( ; ; GCN-HSA-LABEL: global_load_i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dword v1, v0, s[2:3] @@ -88,7 +88,7 @@ entry: define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v2i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -105,7 +105,7 @@ define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-HSA-LABEL: global_load_v2i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -118,7 +118,7 @@ define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-NOHSA-LABEL: global_load_v2i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -151,7 +151,7 @@ define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v2i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v2, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -167,7 +167,7 @@ entry: define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v3i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -185,7 +185,7 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-HSA-LABEL: global_load_v3i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -198,7 +198,7 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-NOHSA-LABEL: global_load_v3i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -236,7 +236,7 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v3i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx3 v[0:2], v3, s[2:3] @@ -252,7 +252,7 @@ entry: define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v4i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NOHSA-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -269,7 +269,7 @@ define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-HSA-LABEL: global_load_v4i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -282,7 +282,7 @@ define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-NOHSA-LABEL: global_load_v4i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -315,7 +315,7 @@ define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v4i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -331,7 +331,7 @@ entry: define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v8i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -351,7 +351,7 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-HSA-LABEL: global_load_v8i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCNX3-HSA-NEXT: 
s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 @@ -375,7 +375,7 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-NOHSA-LABEL: global_load_v8i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -415,7 +415,7 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v8i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16 @@ -434,7 +434,7 @@ entry: define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v9i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -457,7 +457,7 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-HSA-LABEL: global_load_v9i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -492,7 +492,7 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-NOHSA-LABEL: global_load_v9i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -543,7 +543,7 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v9i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] @@ -565,7 +565,7 @@ entry: define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v10i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -588,7 +588,7 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-HSA-LABEL: global_load_v10i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -623,7 +623,7 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-NOHSA-LABEL: global_load_v10i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -672,7 +672,7 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa ; ; GCN-HSA-LABEL: global_load_v10i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 
0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v10, s[2:3] @@ -694,7 +694,7 @@ entry: define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v11i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -718,7 +718,7 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-HSA-LABEL: global_load_v11i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -753,7 +753,7 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-NOHSA-LABEL: global_load_v11i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -807,7 +807,7 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa ; ; GCN-HSA-LABEL: global_load_v11i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v11, s[2:3] @@ -830,7 +830,7 @@ entry: define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v12i32: ; SI-NOHSA: ; %bb.0: ; %entry -; 
SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -853,7 +853,7 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-HSA-LABEL: global_load_v12i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -888,7 +888,7 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-NOHSA-LABEL: global_load_v12i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -938,7 +938,7 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa ; ; GCN-HSA-LABEL: global_load_v12i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v12, s[2:3] @@ -960,7 +960,7 @@ entry: define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v16i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -986,7 +986,7 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-HSA-LABEL: global_load_v16i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; 
GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -1032,7 +1032,7 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-NOHSA-LABEL: global_load_v16i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1090,7 +1090,7 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa ; ; GCN-HSA-LABEL: global_load_v16i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v16, s[2:3] offset:32 @@ -1115,7 +1115,7 @@ entry: define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_zextload_i32_to_i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1133,7 +1133,7 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr ; ; GCNX3-HSA-LABEL: global_zextload_i32_to_i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1147,7 +1147,7 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr ; ; GCNX3-NOHSA-LABEL: 
global_zextload_i32_to_i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1182,7 +1182,7 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr ; ; GCN-HSA-LABEL: global_zextload_i32_to_i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dword v0, v1, s[2:3] @@ -1198,7 +1198,7 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_sextload_i32_to_i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1216,7 +1216,7 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr ; ; GCNX3-HSA-LABEL: global_sextload_i32_to_i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1230,7 +1230,7 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr ; ; GCNX3-NOHSA-LABEL: global_sextload_i32_to_i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1265,7 +1265,7 @@ 
define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr ; ; GCN-HSA-LABEL: global_sextload_i32_to_i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dword v0, v2, s[2:3] @@ -1282,7 +1282,7 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_zextload_v1i32_to_v1i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1300,7 +1300,7 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_zextload_v1i32_to_v1i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1314,7 +1314,7 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_zextload_v1i32_to_v1i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1349,7 +1349,7 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v1i32_to_v1i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; 
GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dword v0, v1, s[2:3] @@ -1365,7 +1365,7 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_sextload_v1i32_to_v1i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1383,7 +1383,7 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_sextload_v1i32_to_v1i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1397,7 +1397,7 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_sextload_v1i32_to_v1i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1432,7 +1432,7 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v1i32_to_v1i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dword v0, v2, s[2:3] @@ -1449,7 +1449,7 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) 
#0 { ; SI-NOHSA-LABEL: global_zextload_v2i32_to_v2i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1470,7 +1470,7 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_zextload_v2i32_to_v2i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1487,7 +1487,7 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_zextload_v2i32_to_v2i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1528,7 +1528,7 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v2i32_to_v2i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx2 v[2:3], v1, s[2:3] @@ -1547,7 +1547,7 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_sextload_v2i32_to_v2i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 
s10, s6 @@ -1567,7 +1567,7 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_sextload_v2i32_to_v2i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1583,7 +1583,7 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_sextload_v2i32_to_v2i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1625,7 +1625,7 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v2i32_to_v2i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] @@ -1644,7 +1644,7 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_zextload_v4i32_to_v4i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1669,7 +1669,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_zextload_v4i32_to_v4i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], 
s[6:7], 0x0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v5 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1694,7 +1694,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_zextload_v4i32_to_v4i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1746,7 +1746,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v4i32_to_v4i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1769,7 +1769,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_sextload_v4i32_to_v4i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1795,7 +1795,7 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_sextload_v4i32_to_v4i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1821,7 +1821,7 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_sextload_v4i32_to_v4i64: ; GCNX3-NOHSA: 
; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1877,7 +1877,7 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v4i32_to_v4i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v11, s[2:3] @@ -1902,7 +1902,7 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_zextload_v8i32_to_v8i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1936,7 +1936,7 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_zextload_v8i32_to_v8i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v9 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1981,7 +1981,7 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_zextload_v8i32_to_v8i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -2059,7 +2059,7 @@ define amdgpu_kernel 
void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v8i32_to_v8i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2091,7 +2091,7 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_sextload_v8i32_to_v8i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -2129,7 +2129,7 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_sextload_v8i32_to_v8i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2179,7 +2179,7 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_sextload_v8i32_to_v8i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -2266,7 +2266,7 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v8i32_to_v8i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, 0 ; GCN-HSA-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v23, s[2:3] @@ -2303,7 +2303,7 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_sextload_v16i32_to_v16i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s2, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -2365,7 +2365,7 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; ; GCNX3-HSA-LABEL: global_sextload_v16i32_to_v16i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2463,7 +2463,7 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; ; GCNX3-NOHSA-LABEL: global_sextload_v16i32_to_v16i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -2613,7 +2613,7 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_sextload_v16i32_to_v16i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v36, s[2:3] offset:32 @@ -2674,7 +2674,7 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr 
addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_zextload_v16i32_to_v16i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s2, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -2726,7 +2726,7 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; ; GCNX3-HSA-LABEL: global_zextload_v16i32_to_v16i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2810,7 +2810,7 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; ; GCNX3-NOHSA-LABEL: global_zextload_v16i32_to_v16i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -2941,7 +2941,7 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_zextload_v16i32_to_v16i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2995,9 +2995,9 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; SI-NOHSA-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; SI-NOHSA-NEXT: s_mov_b32 s14, -1 ; SI-NOHSA-NEXT: s_mov_b32 s15, 0xe8f000 -; SI-NOHSA-NEXT: s_add_u32 s12, s12, s3 +; SI-NOHSA-NEXT: s_add_u32 s12, s12, s9 ; SI-NOHSA-NEXT: s_addc_u32 s13, s13, 0 -; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NOHSA-NEXT: 
s_mov_b32 s3, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s2, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -3112,7 +3112,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCNX3-HSA-LABEL: global_sextload_v32i32_to_v32i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -3305,7 +3305,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCNX3-NOHSA-LABEL: global_sextload_v32i32_to_v32i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -3580,12 +3580,12 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCN-GFX900-HSA-LABEL: global_sextload_v32i32_to_v32i64: ; GCN-GFX900-HSA: ; %bb.0: -; GCN-GFX900-HSA-NEXT: s_mov_b64 s[10:11], s[2:3] -; GCN-GFX900-HSA-NEXT: s_mov_b64 s[8:9], s[0:1] -; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-GFX900-HSA-NEXT: s_mov_b64 s[18:19], s[2:3] +; GCN-GFX900-HSA-NEXT: s_mov_b64 s[16:17], s[0:1] +; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, 0 -; GCN-GFX900-HSA-NEXT: s_add_u32 s8, s8, s7 -; GCN-GFX900-HSA-NEXT: s_addc_u32 s9, s9, 0 +; GCN-GFX900-HSA-NEXT: s_add_u32 s16, s16, s13 +; GCN-GFX900-HSA-NEXT: s_addc_u32 s17, s17, 0 ; GCN-GFX900-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:96 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:112 @@ -3611,11 +3611,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0 ; 
GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v4, v0 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v6, v1 -; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[8:11], 0 ; 4-byte Folded Spill +; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[16:19], 0 ; 4-byte Folded Spill ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill -; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill +; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill +; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill +; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v12 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v11 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v40, 31, v10 @@ -3654,11 +3654,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[33:36], s[0:1] offset:224 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[29:32], s[0:1] offset:240 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:192 -; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[8:11], 0 ; 4-byte Folded Reload +; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[16:19], 0 ; 4-byte Folded Reload ; GCN-GFX900-HSA-NEXT: s_nop 0 -; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload -; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload -; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload +; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload +; GCN-GFX900-HSA-NEXT: buffer_load_dword 
v34, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload +; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(8) ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v52 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v51 @@ -3695,7 +3695,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCN-GFX908-HSA-LABEL: global_sextload_v32i32_to_v32i64: ; GCN-GFX908-HSA: ; %bb.0: -; GCN-GFX908-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-GFX908-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-GFX908-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:96 @@ -3811,7 +3811,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_zextload_v32i32_to_v32i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s2, -1 ; SI-NOHSA-NEXT: v_mov_b32_e32 v1, 0 @@ -3899,7 +3899,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCNX3-HSA-LABEL: global_zextload_v32i32_to_v32i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -4064,7 +4064,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCNX3-NOHSA-LABEL: global_zextload_v32i32_to_v32i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCNX3-NOHSA-NEXT: 
s_mov_b32 s3, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -4303,7 +4303,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_zextload_v32i32_to_v32i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4389,7 +4389,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v32i32: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -4423,7 +4423,7 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-HSA-LABEL: global_load_v32i32: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -4515,7 +4515,7 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-NOHSA-LABEL: global_load_v32i32: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -4603,7 +4603,7 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; ; GCN-HSA-LABEL: global_load_v32i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 
s[0:3], s[6:7], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v32, s[2:3] offset:96 diff --git a/llvm/test/CodeGen/AMDGPU/local-64.ll b/llvm/test/CodeGen/AMDGPU/local-64.ll index 26b559ae6fa9a..a71418f3dbf5b 100644 --- a/llvm/test/CodeGen/AMDGPU/local-64.ll +++ b/llvm/test/CodeGen/AMDGPU/local-64.ll @@ -9,7 +9,7 @@ ; GCN: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28 ; GCN: buffer_store_dword [[REG]], -define amdgpu_kernel void @local_i32_load(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind { +define amdgpu_kernel void @local_i32_load(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { %gep = getelementptr i32, ptr addrspace(3) %in, i32 7 %val = load i32, ptr addrspace(3) %gep, align 4 store i32 %val, ptr addrspace(1) %out, align 4 @@ -22,7 +22,7 @@ define amdgpu_kernel void @local_i32_load(ptr addrspace(1) %out, ptr addrspace(3 ; GCN: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} ; GCN: buffer_store_dword [[REG]], -define amdgpu_kernel void @local_i32_load_0_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind { +define amdgpu_kernel void @local_i32_load_0_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { %val = load i32, ptr addrspace(3) %in, align 4 store i32 %val, ptr addrspace(1) %out, align 4 ret void @@ -35,7 +35,7 @@ define amdgpu_kernel void @local_i32_load_0_offset(ptr addrspace(1) %out, ptr ad ; GCN-NOT: add ; GCN: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535 ; GCN: buffer_store_byte [[REG]], -define amdgpu_kernel void @local_i8_load_i16_max_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind { +define amdgpu_kernel void @local_i8_load_i16_max_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { %gep = getelementptr i8, ptr addrspace(3) %in, i32 65535 %val = load i8, ptr addrspace(3) %gep, align 4 store i8 %val, ptr addrspace(1) %out, align 4 @@ -56,7 +56,7 @@ define amdgpu_kernel void @local_i8_load_i16_max_offset(ptr addrspace(1) %out, p 
; GCN-DAG: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]] ; GCN: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]] ; GCN: buffer_store_byte [[REG]], -define amdgpu_kernel void @local_i8_load_over_i16_max_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind { +define amdgpu_kernel void @local_i8_load_over_i16_max_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { %gep = getelementptr i8, ptr addrspace(3) %in, i32 65536 %val = load i8, ptr addrspace(3) %gep, align 4 store i8 %val, ptr addrspace(1) %out, align 4 @@ -70,7 +70,7 @@ define amdgpu_kernel void @local_i8_load_over_i16_max_offset(ptr addrspace(1) %o ; GCN-NOT: add ; GCN: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 ; GCN: buffer_store_dwordx2 [[REG]], -define amdgpu_kernel void @local_i64_load(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind { +define amdgpu_kernel void @local_i64_load(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { %gep = getelementptr i64, ptr addrspace(3) %in, i32 7 %val = load i64, ptr addrspace(3) %gep, align 8 store i64 %val, ptr addrspace(1) %out, align 8 @@ -83,7 +83,7 @@ define amdgpu_kernel void @local_i64_load(ptr addrspace(1) %out, ptr addrspace(3 ; GCN: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} ; GCN: buffer_store_dwordx2 [[REG]], -define amdgpu_kernel void @local_i64_load_0_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind { +define amdgpu_kernel void @local_i64_load_0_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { %val = load i64, ptr addrspace(3) %in, align 8 store i64 %val, ptr addrspace(1) %out, align 8 ret void @@ -96,7 +96,7 @@ define amdgpu_kernel void @local_i64_load_0_offset(ptr addrspace(1) %out, ptr ad ; GCN-NOT: add ; GCN: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 ; GCN: buffer_store_dwordx2 [[REG]], -define amdgpu_kernel void @local_f64_load(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind { +define amdgpu_kernel void @local_f64_load(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { %gep 
= getelementptr double, ptr addrspace(3) %in, i32 7 %val = load double, ptr addrspace(3) %gep, align 8 store double %val, ptr addrspace(1) %out, align 8 @@ -109,7 +109,7 @@ define amdgpu_kernel void @local_f64_load(ptr addrspace(1) %out, ptr addrspace(3 ; GCN: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} ; GCN: buffer_store_dwordx2 [[REG]], -define amdgpu_kernel void @local_f64_load_0_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind { +define amdgpu_kernel void @local_f64_load_0_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { %val = load double, ptr addrspace(3) %in, align 8 store double %val, ptr addrspace(1) %out, align 8 ret void @@ -121,7 +121,7 @@ define amdgpu_kernel void @local_f64_load_0_offset(ptr addrspace(1) %out, ptr ad ; GCN-NOT: add ; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 -define amdgpu_kernel void @local_i64_store(ptr addrspace(3) %out) nounwind { +define amdgpu_kernel void @local_i64_store(ptr addrspace(3) %out) #0 { %gep = getelementptr i64, ptr addrspace(3) %out, i32 7 store i64 5678, ptr addrspace(3) %gep, align 8 ret void @@ -133,7 +133,7 @@ define amdgpu_kernel void @local_i64_store(ptr addrspace(3) %out) nounwind { ; GCN-NOT: add ; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -define amdgpu_kernel void @local_i64_store_0_offset(ptr addrspace(3) %out) nounwind { +define amdgpu_kernel void @local_i64_store_0_offset(ptr addrspace(3) %out) #0 { store i64 1234, ptr addrspace(3) %out, align 8 ret void } @@ -144,7 +144,7 @@ define amdgpu_kernel void @local_i64_store_0_offset(ptr addrspace(3) %out) nounw ; GCN-NOT: add ; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 -define amdgpu_kernel void @local_f64_store(ptr addrspace(3) %out) nounwind { +define amdgpu_kernel void @local_f64_store(ptr addrspace(3) %out) #0 { %gep = getelementptr double, ptr addrspace(3) %out, i32 7 store double 16.0, ptr addrspace(3) %gep, align 8 ret void @@ -155,7 +155,7 @@ define amdgpu_kernel 
void @local_f64_store(ptr addrspace(3) %out) nounwind { ; GFX9-NOT: m0 ; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -define amdgpu_kernel void @local_f64_store_0_offset(ptr addrspace(3) %out) nounwind { +define amdgpu_kernel void @local_f64_store_0_offset(ptr addrspace(3) %out) #0 { store double 20.0, ptr addrspace(3) %out, align 8 ret void } @@ -168,7 +168,7 @@ define amdgpu_kernel void @local_f64_store_0_offset(ptr addrspace(3) %out) nounw ; SI: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15 ; CIPLUS: ds_write_b128 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:112 ; GCN: s_endpgm -define amdgpu_kernel void @local_v2i64_store(ptr addrspace(3) %out) nounwind { +define amdgpu_kernel void @local_v2i64_store(ptr addrspace(3) %out) #0 { %gep = getelementptr <2 x i64>, ptr addrspace(3) %out, i32 7 store <2 x i64> , ptr addrspace(3) %gep, align 16 ret void @@ -184,7 +184,7 @@ define amdgpu_kernel void @local_v2i64_store(ptr addrspace(3) %out) nounwind { ; CIPLUS: ds_write_b128 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]$}} ; GCN: s_endpgm -define amdgpu_kernel void @local_v2i64_store_0_offset(ptr addrspace(3) %out) nounwind { +define amdgpu_kernel void @local_v2i64_store_0_offset(ptr addrspace(3) %out) #0 { store <2 x i64> , ptr addrspace(3) %out, align 16 ret void } @@ -201,7 +201,7 @@ define amdgpu_kernel void @local_v2i64_store_0_offset(ptr addrspace(3) %out) nou ; CIPLUS-DAG: ds_write_b128 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:240{{$}} ; GCN: s_endpgm -define amdgpu_kernel void @local_v4i64_store(ptr addrspace(3) %out) nounwind { +define amdgpu_kernel void @local_v4i64_store(ptr addrspace(3) %out) #0 { %gep = getelementptr <4 x i64>, ptr addrspace(3) %out, i32 7 store <4 x i64> , ptr addrspace(3) %gep, align 16 ret void @@ -219,7 +219,9 @@ define amdgpu_kernel void @local_v4i64_store(ptr addrspace(3) %out) nounwind { ; CIPLUS-DAG: ds_write_b128 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:16{{$}} ; GCN: s_endpgm 
-define amdgpu_kernel void @local_v4i64_store_0_offset(ptr addrspace(3) %out) nounwind { +define amdgpu_kernel void @local_v4i64_store_0_offset(ptr addrspace(3) %out) #0 { store <4 x i64> , ptr addrspace(3) %out, align 16 ret void } + +attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 03ee6a325fbbc..d8a790c718408 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -6979,28 +6979,28 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %ptrf, i32 %idx) { ; GFX12-LABEL: local_ds_fadd: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x8 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x8 ; GFX12-NEXT: s_mov_b32 s6, exec_lo ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_co_i32 s3, s5, 4 -; GFX12-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX12-NEXT: s_add_co_i32 s1, s5, 4 +; GFX12-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX12-NEXT: s_cbranch_execz .LBB28_2 ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 -; GFX12-NEXT: s_lshl_b32 s5, s3, 3 +; GFX12-NEXT: s_lshl_b32 s5, s1, 3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX12-NEXT: 
ds_add_rtn_f32 v1, v2, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: .LBB28_2: -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_mov_b32 s7, exec_lo ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -7009,20 +7009,20 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX12-NEXT: s_cbranch_execz .LBB28_4 ; GFX12-NEXT: ; %bb.3: -; GFX12-NEXT: s_bcnt1_i32_b32 s2, s7 +; GFX12-NEXT: s_bcnt1_i32_b32 s0, s7 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 -; GFX12-NEXT: s_lshl_b32 s2, s3, 4 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX12-NEXT: s_lshl_b32 s0, s1, 4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mul_f32 v1, 0x42280000, v1 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX12-NEXT: ds_add_f32 v2, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: .LBB28_4: ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX12-NEXT: s_mov_b32 s3, exec_lo -; GFX12-NEXT: s_brev_b32 s2, 1 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_brev_b32 s0, 1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX12-NEXT: v_add_f32_e32 v0, s5, v0 @@ -7031,32 +7031,32 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: ; implicit-def: $vgpr0 ; GFX12-NEXT: .LBB28_5: ; %ComputeLoop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_ctz_i32_b32 s5, s3 +; GFX12-NEXT: s_ctz_i32_b32 s5, s1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: 
v_readlane_b32 s6, v1, s5 ; GFX12-NEXT: s_lshl_b32 s7, 1, s5 -; GFX12-NEXT: v_writelane_b32 v0, s2, s5 -; GFX12-NEXT: s_and_not1_b32 s3, s3, s7 +; GFX12-NEXT: v_writelane_b32 v0, s0, s5 +; GFX12-NEXT: s_and_not1_b32 s1, s1, s7 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_lg_u32 s3, 0 -; GFX12-NEXT: s_add_f32 s2, s2, s6 +; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_add_f32 s0, s0, s6 ; GFX12-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX12-NEXT: ; %bb.6: ; %ComputeEnd ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12-NEXT: ; implicit-def: $vgpr1 -; GFX12-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX12-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX12-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX12-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execz .LBB28_8 ; GFX12-NEXT: ; %bb.7: -; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s2 +; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: ds_add_rtn_f32 v1, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: .LBB28_8: -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, s2, v0 @@ -7069,10 +7069,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX940-LABEL: local_ds_fadd: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; GFX940-NEXT: s_mov_b64 s[0:1], 
exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_add_i32 s5, s5, 4 @@ -7080,9 +7080,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX940-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX940-NEXT: s_cbranch_execz .LBB28_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX940-NEXT: s_lshl_b32 s8, s5, 3 -; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GFX940-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -7093,15 +7093,15 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX940-NEXT: v_readfirstlane_b32 s10, v1 ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX940-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 -; GFX940-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] +; GFX940-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 +; GFX940-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX940-NEXT: s_cbranch_execz .LBB28_4 ; GFX940-NEXT: ; %bb.3: -; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[8:9] -; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 -; GFX940-NEXT: s_lshl_b32 s2, s5, 4 +; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[8:9] +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX940-NEXT: s_lshl_b32 s0, s5, 4 ; GFX940-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 -; GFX940-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f32 v2, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: .LBB28_4: @@ -7110,20 +7110,20 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX940-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX940-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX940-NEXT: v_mov_b32_e32 v1, s10 -; GFX940-NEXT: s_mov_b64 s[2:3], exec +; 
GFX940-NEXT: s_mov_b64 s[0:1], exec ; GFX940-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX940-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX940-NEXT: ; implicit-def: $vgpr0 ; GFX940-NEXT: .LBB28_5: ; %ComputeLoop ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX940-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX940-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX940-NEXT: v_readfirstlane_b32 s8, v1 ; GFX940-NEXT: v_readlane_b32 s9, v2, s5 ; GFX940-NEXT: s_mov_b32 m0, s5 -; GFX940-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX940-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX940-NEXT: v_writelane_b32 v0, s8, m0 -; GFX940-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX940-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX940-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX940-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX940-NEXT: ; %bb.6: ; %ComputeEnd @@ -7131,16 +7131,16 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX940-NEXT: ; implicit-def: $vgpr2 -; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX940-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX940-NEXT: s_cbranch_execz .LBB28_8 ; GFX940-NEXT: ; %bb.7: ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: .LBB28_8: -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX940-NEXT: v_readfirstlane_b32 s2, v2 ; GFX940-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-NEXT: s_nop 0 @@ -7153,28 +7153,28 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX11-LABEL: local_ds_fadd: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x8 ; 
GFX11-NEXT: s_mov_b32 s6, exec_lo ; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s3, s5, 4 -; GFX11-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX11-NEXT: s_add_i32 s1, s5, 4 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 -; GFX11-NEXT: s_lshl_b32 s5, s3, 3 +; GFX11-NEXT: s_lshl_b32 s5, s1, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX11-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: .LBB28_2: -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_mov_b32 s7, exec_lo ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -7183,12 +7183,12 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX11-NEXT: s_cbranch_execz .LBB28_4 ; GFX11-NEXT: ; %bb.3: -; GFX11-NEXT: s_bcnt1_i32_b32 s2, s7 +; GFX11-NEXT: s_bcnt1_i32_b32 s0, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 -; GFX11-NEXT: s_lshl_b32 s2, s3, 4 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX11-NEXT: s_lshl_b32 s0, s1, 4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mul_f32 v1, 0x42280000, v1 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX11-NEXT: ds_add_f32 v2, v1 ; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -7196,7 +7196,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX11-NEXT: v_add_f32_e32 v0, s5, v0 @@ -7205,25 +7205,25 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: ; implicit-def: $vgpr0 ; GFX11-NEXT: .LBB28_5: ; %ComputeLoop ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_ctz_i32_b32 s3, s2 +; GFX11-NEXT: s_ctz_i32_b32 s1, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readlane_b32 s6, v2, s3 -; GFX11-NEXT: s_lshl_b32 s7, 1, s3 +; GFX11-NEXT: v_readlane_b32 s6, v2, s1 +; GFX11-NEXT: s_lshl_b32 s7, 1, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: s_and_not1_b32 s2, s2, s7 -; GFX11-NEXT: v_writelane_b32 v0, s5, s3 +; GFX11-NEXT: s_and_not1_b32 s0, s0, s7 +; GFX11-NEXT: v_writelane_b32 v0, s5, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX11-NEXT: ; %bb.6: ; %ComputeEnd ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX11-NEXT: ; implicit-def: $vgpr2 -; GFX11-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz 
.LBB28_8 ; GFX11-NEXT: ; %bb.7: ; GFX11-NEXT: v_mov_b32_e32 v2, s4 @@ -7231,8 +7231,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: .LBB28_8: -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_readfirstlane_b32 s2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, s2, v0 @@ -7245,19 +7245,19 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX10-LABEL: local_ds_fadd: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: ; implicit-def: $vgpr1 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_i32 s3, s5, 4 -; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10-NEXT: s_add_i32 s1, s5, 4 +; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB28_2 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 -; GFX10-NEXT: s_lshl_b32 s5, s3, 3 +; GFX10-NEXT: s_lshl_b32 s5, s1, 3 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX10-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -7265,18 +7265,18 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: .LBB28_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-NEXT: s_mov_b32 s7, exec_lo ; GFX10-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 -; GFX10-NEXT: 
v_cmp_eq_u32_e64 s2, 0, v2 -; GFX10-NEXT: s_and_saveexec_b32 s6, s2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 +; GFX10-NEXT: s_and_saveexec_b32 s6, s0 ; GFX10-NEXT: s_cbranch_execz .LBB28_4 ; GFX10-NEXT: ; %bb.3: -; GFX10-NEXT: s_bcnt1_i32_b32 s2, s7 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 -; GFX10-NEXT: s_lshl_b32 s2, s3, 4 -; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: s_bcnt1_i32_b32 s0, s7 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX10-NEXT: s_lshl_b32 s0, s1, 4 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_add_f32 v2, v1 @@ -7287,28 +7287,28 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX10-NEXT: s_mov_b32 s2, exec_lo +; GFX10-NEXT: s_mov_b32 s0, exec_lo ; GFX10-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX10-NEXT: v_add_f32_e32 v0, s5, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s5, vcc_lo ; GFX10-NEXT: ; implicit-def: $vgpr0 ; GFX10-NEXT: .LBB28_5: ; %ComputeLoop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_ff1_i32_b32 s3, s2 +; GFX10-NEXT: s_ff1_i32_b32 s1, s0 ; GFX10-NEXT: v_readfirstlane_b32 s5, v1 -; GFX10-NEXT: v_readlane_b32 s6, v2, s3 -; GFX10-NEXT: s_lshl_b32 s7, 1, s3 -; GFX10-NEXT: s_andn2_b32 s2, s2, s7 -; GFX10-NEXT: v_writelane_b32 v0, s5, s3 +; GFX10-NEXT: v_readlane_b32 s6, v2, s1 +; GFX10-NEXT: s_lshl_b32 s7, 1, s1 +; GFX10-NEXT: s_andn2_b32 s0, s0, s7 +; GFX10-NEXT: v_writelane_b32 v0, s5, s1 ; GFX10-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX10-NEXT: s_cmp_lg_u32 s2, 0 +; GFX10-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX10-NEXT: ; %bb.6: ; %ComputeEnd ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX10-NEXT: ; implicit-def: $vgpr2 -; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo -; 
GFX10-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX10-NEXT: s_cbranch_execz .LBB28_8 ; GFX10-NEXT: ; %bb.7: ; GFX10-NEXT: v_mov_b32_e32 v2, s4 @@ -7318,8 +7318,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: .LBB28_8: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_mov_b32 null, 0 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_add_f32_e32 v0, s2, v0 @@ -7330,10 +7331,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX90A-LABEL: local_ds_fadd: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 -; GFX90A-NEXT: s_mov_b64 s[2:3], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; GFX90A-NEXT: s_mov_b64 s[0:1], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_add_i32 s5, s5, 4 @@ -7341,9 +7342,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB28_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX90A-NEXT: s_lshl_b32 s8, s5, 3 -; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -7354,15 +7355,15 @@ define amdgpu_kernel void 
@local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-NEXT: v_readfirstlane_b32 s10, v1 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] +; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX90A-NEXT: s_cbranch_execz .LBB28_4 ; GFX90A-NEXT: ; %bb.3: -; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[8:9] -; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 -; GFX90A-NEXT: s_lshl_b32 s2, s5, 4 +; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[8:9] +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX90A-NEXT: s_lshl_b32 s0, s5, 4 ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f32 v2, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB28_4: @@ -7371,20 +7372,20 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX90A-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s10 -; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b64 s[0:1], exec ; GFX90A-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX90A-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: .LBB28_5: ; %ComputeLoop ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX90A-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v1 ; GFX90A-NEXT: v_readlane_b32 s9, v2, s5 ; GFX90A-NEXT: s_mov_b32 m0, s5 -; GFX90A-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX90A-NEXT: v_writelane_b32 v0, s8, m0 -; GFX90A-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX90A-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX90A-NEXT: ; %bb.6: ; 
%ComputeEnd @@ -7392,16 +7393,16 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX90A-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX90A-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX90A-NEXT: s_cbranch_execz .LBB28_8 ; GFX90A-NEXT: ; %bb.7: ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB28_8: -; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX90A-NEXT: v_readfirstlane_b32 s2, v2 ; GFX90A-NEXT: v_add_f32_e32 v0, s2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v2, s2 @@ -7413,10 +7414,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX908-LABEL: local_ds_fadd: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 -; GFX908-NEXT: s_mov_b64 s[2:3], exec -; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX908-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; GFX908-NEXT: s_mov_b64 s[0:1], exec +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_add_i32 s5, s5, 4 @@ -7424,9 +7425,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX908-NEXT: s_cbranch_execz .LBB28_2 ; GFX908-NEXT: ; %bb.1: -; GFX908-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX908-NEXT: s_lshl_b32 s8, s5, 3 -; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, 
s0 ; GFX908-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, s8 ; GFX908-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -7437,15 +7438,15 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-NEXT: v_readfirstlane_b32 s10, v1 ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX908-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] +; GFX908-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 +; GFX908-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX908-NEXT: s_cbranch_execz .LBB28_4 ; GFX908-NEXT: ; %bb.3: -; GFX908-NEXT: s_bcnt1_i32_b64 s2, s[8:9] -; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 -; GFX908-NEXT: s_lshl_b32 s2, s5, 4 +; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[8:9] +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX908-NEXT: s_lshl_b32 s0, s5, 4 ; GFX908-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 -; GFX908-NEXT: v_mov_b32_e32 v2, s2 +; GFX908-NEXT: v_mov_b32_e32 v2, s0 ; GFX908-NEXT: ds_add_f32 v2, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: .LBB28_4: @@ -7454,20 +7455,20 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX908-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX908-NEXT: v_mov_b32_e32 v1, s10 -; GFX908-NEXT: s_mov_b64 s[2:3], exec +; GFX908-NEXT: s_mov_b64 s[0:1], exec ; GFX908-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX908-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX908-NEXT: ; implicit-def: $vgpr0 ; GFX908-NEXT: .LBB28_5: ; %ComputeLoop ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX908-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX908-NEXT: v_readfirstlane_b32 s8, v1 ; GFX908-NEXT: v_readlane_b32 s9, v2, s5 ; GFX908-NEXT: s_mov_b32 m0, s5 -; GFX908-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX908-NEXT: v_writelane_b32 v0, s8, m0 -; 
GFX908-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX908-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX908-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX908-NEXT: ; %bb.6: ; %ComputeEnd @@ -7475,16 +7476,16 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX908-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX908-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX908-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX908-NEXT: s_cbranch_execz .LBB28_8 ; GFX908-NEXT: ; %bb.7: ; GFX908-NEXT: v_mov_b32_e32 v2, s4 ; GFX908-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: .LBB28_8: -; GFX908-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX908-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX908-NEXT: v_readfirstlane_b32 s2, v2 ; GFX908-NEXT: v_add_f32_e32 v0, s2, v0 ; GFX908-NEXT: v_mov_b32_e32 v2, s2 @@ -7496,10 +7497,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX8-LABEL: local_ds_fadd: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_i32 s5, s5, 4 @@ -7508,9 +7509,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_cbranch_execz .LBB28_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: 
s_bcnt1_i32_b64 s0, s[0:1] ; GFX8-NEXT: s_lshl_b32 s8, s5, 3 -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -7521,15 +7522,15 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: v_readfirstlane_b32 s10, v1 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 -; GFX8-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB28_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[8:9] -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 -; GFX8-NEXT: s_lshl_b32 s2, s5, 4 +; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[8:9] +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX8-NEXT: s_lshl_b32 s0, s5, 4 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: ds_add_f32 v2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB28_4: @@ -7538,20 +7539,20 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s10 -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: .LBB28_5: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_readfirstlane_b32 s8, v1 ; GFX8-NEXT: v_readlane_b32 s9, v2, s5 ; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_andn2_b64 s[0:1], 
s[0:1], s[6:7] ; GFX8-NEXT: v_writelane_b32 v0, s8, m0 -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX8-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX8-NEXT: ; %bb.6: ; %ComputeEnd @@ -7559,8 +7560,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr2 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB28_8 ; GFX8-NEXT: ; %bb.7: ; GFX8-NEXT: v_mov_b32_e32 v2, s4 @@ -7568,8 +7569,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB28_8: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2 ; GFX8-NEXT: v_add_f32_e32 v0, s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 @@ -7582,10 +7583,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX7-LABEL: local_ds_fadd: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; GFX7-NEXT: s_mov_b64 s[2:3], exec -; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2 +; GFX7-NEXT: s_mov_b64 s[0:1], exec +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_i32 s5, s5, 4 @@ -7597,8 +7598,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: s_lshl_b32 s8, s5, 3 ; 
GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: ds_read_b32 v1, v2 -; GFX7-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 +; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 ; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB28_2: ; %atomicrmw.start @@ -7608,8 +7609,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_add_f32_e32 v1, v4, v3 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, v4 -; GFX7-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 +; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_2 ; GFX7-NEXT: ; %bb.3: ; %Flow18 @@ -7620,15 +7621,15 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_readfirstlane_b32 s10, v1 ; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s8, 0 ; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s9, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 -; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX7-NEXT: s_cbranch_execz .LBB28_7 ; GFX7-NEXT: ; %bb.5: -; GFX7-NEXT: s_lshl_b32 s2, s5, 4 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: s_lshl_b32 s0, s5, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: ds_read_b32 v3, v1 -; GFX7-NEXT: s_bcnt1_i32_b64 s2, s[8:9] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 +; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[8:9] +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 ; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB28_6: ; %atomicrmw.start2 @@ -7637,8 +7638,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 ; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, 
v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], v4, v3 -; GFX7-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 +; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_6 @@ -7651,7 +7652,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_add_f32_e32 v2, s10, v2 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX7-NEXT: s_mov_b64 s[2:3], 0 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: .LBB28_8: ; %atomicrmw.start8 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7660,12 +7661,12 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: ds_cmpst_rtn_b32 v0, v1, v3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7-NEXT: s_cbranch_execnz .LBB28_8 ; GFX7-NEXT: ; %bb.9: ; %atomicrmw.end7 -; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7674,10 +7675,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX6-LABEL: local_ds_fadd: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; GFX6-NEXT: s_mov_b64 s[2:3], exec -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2 +; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX6-NEXT: 
v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_add_i32 s5, s5, 4 @@ -7689,8 +7690,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: s_lshl_b32 s8, s5, 3 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ds_read_b32 v1, v2 -; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 +; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB28_2: ; %atomicrmw.start @@ -7700,8 +7701,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_add_f32_e32 v1, v4, v3 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, v4 -; GFX6-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 +; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_2 ; GFX6-NEXT: ; %bb.3: ; %Flow16 @@ -7712,15 +7713,15 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_readfirstlane_b32 s10, v1 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s8, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s9, v1 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 -; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB28_7 ; GFX6-NEXT: ; %bb.5: -; GFX6-NEXT: s_lshl_b32 s2, s5, 4 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_lshl_b32 s0, s5, 4 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_read_b32 v3, v1 -; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[8:9] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 +; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[8:9] +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 ; GFX6-NEXT: 
v_mul_f32_e32 v2, 0x42280000, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB28_6: ; %atomicrmw.start2 @@ -7729,8 +7730,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 ; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], v4, v3 -; GFX6-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 +; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_6 @@ -7743,7 +7744,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_add_f32_e32 v2, s10, v2 ; GFX6-NEXT: v_mov_b32_e32 v3, s10 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX6-NEXT: s_mov_b64 s[2:3], 0 +; GFX6-NEXT: s_mov_b64 s[0:1], 0 ; GFX6-NEXT: .LBB28_8: ; %atomicrmw.start8 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7752,12 +7753,12 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v3, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GFX6-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX6-NEXT: s_cbranch_execnz .LBB28_8 ; GFX6-NEXT: ; %bb.9: ; %atomicrmw.end7 -; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7778,26 +7779,26 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr 
addrspace(3) %ptrf, i32 %idx) { ; GFX12-LABEL: local_ds_fadd_one_as: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x8 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x8 ; GFX12-NEXT: s_mov_b32 s6, exec_lo ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_co_i32 s3, s5, 4 -; GFX12-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX12-NEXT: s_add_co_i32 s1, s5, 4 +; GFX12-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX12-NEXT: s_cbranch_execz .LBB29_2 ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 -; GFX12-NEXT: s_lshl_b32 s5, s3, 3 +; GFX12-NEXT: s_lshl_b32 s5, s1, 3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX12-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX12-NEXT: .LBB29_2: -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_mov_b32 s7, exec_lo ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -7807,18 +7808,18 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX12-NEXT: s_cbranch_execz .LBB29_4 ; GFX12-NEXT: ; %bb.3: -; GFX12-NEXT: s_bcnt1_i32_b32 s2, s7 +; GFX12-NEXT: s_bcnt1_i32_b32 s0, s7 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 -; GFX12-NEXT: s_lshl_b32 s2, s3, 4 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX12-NEXT: s_lshl_b32 s0, s1, 4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mul_f32 v1, 0x42280000, v1 +; GFX12-NEXT: v_dual_mov_b32 
v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX12-NEXT: ds_add_f32 v2, v1 ; GFX12-NEXT: .LBB29_4: ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX12-NEXT: s_mov_b32 s3, exec_lo -; GFX12-NEXT: s_brev_b32 s2, 1 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_brev_b32 s0, 1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX12-NEXT: v_add_f32_e32 v0, s5, v0 @@ -7827,30 +7828,30 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: ; implicit-def: $vgpr0 ; GFX12-NEXT: .LBB29_5: ; %ComputeLoop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_ctz_i32_b32 s5, s3 +; GFX12-NEXT: s_ctz_i32_b32 s5, s1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_readlane_b32 s6, v1, s5 ; GFX12-NEXT: s_lshl_b32 s7, 1, s5 -; GFX12-NEXT: v_writelane_b32 v0, s2, s5 -; GFX12-NEXT: s_and_not1_b32 s3, s3, s7 +; GFX12-NEXT: v_writelane_b32 v0, s0, s5 +; GFX12-NEXT: s_and_not1_b32 s1, s1, s7 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_lg_u32 s3, 0 -; GFX12-NEXT: s_add_f32 s2, s2, s6 +; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_add_f32 s0, s0, s6 ; GFX12-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX12-NEXT: ; %bb.6: ; %ComputeEnd ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12-NEXT: ; implicit-def: $vgpr1 -; GFX12-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX12-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX12-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX12-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execz .LBB29_8 ; GFX12-NEXT: ; %bb.7: -; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s2 +; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: ds_add_rtn_f32 v1, v1, v2 ; 
GFX12-NEXT: .LBB29_8: -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7864,10 +7865,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX940-LABEL: local_ds_fadd_one_as: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; GFX940-NEXT: s_mov_b64 s[0:1], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_add_i32 s5, s5, 4 @@ -7875,9 +7876,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX940-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX940-NEXT: s_cbranch_execz .LBB29_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX940-NEXT: s_lshl_b32 s8, s5, 3 -; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GFX940-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -7888,15 +7889,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX940-NEXT: v_readfirstlane_b32 s10, v1 ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX940-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 -; GFX940-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] +; GFX940-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 +; GFX940-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX940-NEXT: 
s_cbranch_execz .LBB29_4 ; GFX940-NEXT: ; %bb.3: -; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[8:9] -; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 -; GFX940-NEXT: s_lshl_b32 s2, s5, 4 +; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[8:9] +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX940-NEXT: s_lshl_b32 s0, s5, 4 ; GFX940-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 -; GFX940-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f32 v2, v1 ; GFX940-NEXT: .LBB29_4: ; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] @@ -7904,20 +7905,20 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX940-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX940-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX940-NEXT: v_mov_b32_e32 v1, s10 -; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b64 s[0:1], exec ; GFX940-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX940-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX940-NEXT: ; implicit-def: $vgpr0 ; GFX940-NEXT: .LBB29_5: ; %ComputeLoop ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX940-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX940-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX940-NEXT: v_readfirstlane_b32 s8, v1 ; GFX940-NEXT: v_readlane_b32 s9, v2, s5 ; GFX940-NEXT: s_mov_b32 m0, s5 -; GFX940-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX940-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX940-NEXT: v_writelane_b32 v0, s8, m0 -; GFX940-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX940-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX940-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX940-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX940-NEXT: ; %bb.6: ; %ComputeEnd @@ -7925,15 +7926,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX940-NEXT: ; implicit-def: $vgpr2 -; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX940-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; 
GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX940-NEXT: s_cbranch_execz .LBB29_8 ; GFX940-NEXT: ; %bb.7: ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX940-NEXT: .LBB29_8: -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_readfirstlane_b32 s2, v2 ; GFX940-NEXT: v_mov_b32_e32 v1, 0 @@ -7946,26 +7947,26 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: local_ds_fadd_one_as: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x8 ; GFX11-NEXT: s_mov_b32 s6, exec_lo ; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s3, s5, 4 -; GFX11-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX11-NEXT: s_add_i32 s1, s5, 4 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-NEXT: s_cbranch_execz .LBB29_2 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 -; GFX11-NEXT: s_lshl_b32 s5, s3, 3 +; GFX11-NEXT: s_lshl_b32 s5, s1, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX11-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX11-NEXT: .LBB29_2: -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_mov_b32 s7, exec_lo ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -7975,18 +7976,18 @@ define amdgpu_kernel void 
@local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX11-NEXT: s_cbranch_execz .LBB29_4 ; GFX11-NEXT: ; %bb.3: -; GFX11-NEXT: s_bcnt1_i32_b32 s2, s7 +; GFX11-NEXT: s_bcnt1_i32_b32 s0, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 -; GFX11-NEXT: s_lshl_b32 s2, s3, 4 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX11-NEXT: s_lshl_b32 s0, s1, 4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mul_f32 v1, 0x42280000, v1 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX11-NEXT: ds_add_f32 v2, v1 ; GFX11-NEXT: .LBB29_4: ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX11-NEXT: v_add_f32_e32 v0, s5, v0 @@ -7995,32 +7996,32 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: ; implicit-def: $vgpr0 ; GFX11-NEXT: .LBB29_5: ; %ComputeLoop ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_ctz_i32_b32 s3, s2 +; GFX11-NEXT: s_ctz_i32_b32 s1, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readlane_b32 s6, v2, s3 -; GFX11-NEXT: s_lshl_b32 s7, 1, s3 +; GFX11-NEXT: v_readlane_b32 s6, v2, s1 +; GFX11-NEXT: s_lshl_b32 s7, 1, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: s_and_not1_b32 s2, s2, s7 -; GFX11-NEXT: v_writelane_b32 v0, s5, s3 +; GFX11-NEXT: s_and_not1_b32 s0, s0, s7 +; GFX11-NEXT: v_writelane_b32 v0, s5, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v1, s6, v1 
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX11-NEXT: ; %bb.6: ; %ComputeEnd ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX11-NEXT: ; implicit-def: $vgpr2 -; GFX11-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB29_8 ; GFX11-NEXT: ; %bb.7: ; GFX11-NEXT: v_mov_b32_e32 v2, s4 ; GFX11-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX11-NEXT: .LBB29_8: -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8033,37 +8034,37 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX10-LABEL: local_ds_fadd_one_as: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: ; implicit-def: $vgpr1 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_i32 s3, s5, 4 -; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10-NEXT: s_add_i32 s1, s5, 4 +; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB29_2 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 -; GFX10-NEXT: s_lshl_b32 s5, s3, 3 +; GFX10-NEXT: s_lshl_b32 s5, s1, 3 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX10-NEXT: ds_add_rtn_f32 v1, v2, v1 ; 
GFX10-NEXT: .LBB29_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-NEXT: s_mov_b32 s7, exec_lo ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v2 -; GFX10-NEXT: s_and_saveexec_b32 s6, s2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 +; GFX10-NEXT: s_and_saveexec_b32 s6, s0 ; GFX10-NEXT: s_cbranch_execz .LBB29_4 ; GFX10-NEXT: ; %bb.3: -; GFX10-NEXT: s_bcnt1_i32_b32 s2, s7 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 -; GFX10-NEXT: s_lshl_b32 s2, s3, 4 -; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: s_bcnt1_i32_b32 s0, s7 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX10-NEXT: s_lshl_b32 s0, s1, 4 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX10-NEXT: ds_add_f32 v2, v1 ; GFX10-NEXT: .LBB29_4: @@ -8071,36 +8072,36 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX10-NEXT: s_mov_b32 s2, exec_lo +; GFX10-NEXT: s_mov_b32 s0, exec_lo ; GFX10-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX10-NEXT: v_add_f32_e32 v0, s5, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s5, vcc_lo ; GFX10-NEXT: ; implicit-def: $vgpr0 ; GFX10-NEXT: .LBB29_5: ; %ComputeLoop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_ff1_i32_b32 s3, s2 +; GFX10-NEXT: s_ff1_i32_b32 s1, s0 ; GFX10-NEXT: v_readfirstlane_b32 s5, v1 -; GFX10-NEXT: v_readlane_b32 s6, v2, s3 -; GFX10-NEXT: s_lshl_b32 s7, 1, s3 -; GFX10-NEXT: s_andn2_b32 s2, s2, s7 -; GFX10-NEXT: v_writelane_b32 v0, s5, s3 +; GFX10-NEXT: v_readlane_b32 s6, v2, s1 +; GFX10-NEXT: s_lshl_b32 s7, 1, s1 +; GFX10-NEXT: s_andn2_b32 s0, s0, s7 +; GFX10-NEXT: v_writelane_b32 v0, s5, s1 ; GFX10-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX10-NEXT: s_cmp_lg_u32 
s2, 0 +; GFX10-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX10-NEXT: ; %bb.6: ; %ComputeEnd ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX10-NEXT: ; implicit-def: $vgpr2 -; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX10-NEXT: s_cbranch_execz .LBB29_8 ; GFX10-NEXT: ; %bb.7: ; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX10-NEXT: .LBB29_8: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -8112,10 +8113,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX90A-LABEL: local_ds_fadd_one_as: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 -; GFX90A-NEXT: s_mov_b64 s[2:3], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; GFX90A-NEXT: s_mov_b64 s[0:1], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_add_i32 s5, s5, 4 @@ -8123,9 +8124,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB29_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX90A-NEXT: s_lshl_b32 s8, s5, 3 -; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GFX90A-NEXT: 
v_mul_f32_e32 v1, 0x42280000, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -8136,15 +8137,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: v_readfirstlane_b32 s10, v1 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] +; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX90A-NEXT: s_cbranch_execz .LBB29_4 ; GFX90A-NEXT: ; %bb.3: -; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[8:9] -; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 -; GFX90A-NEXT: s_lshl_b32 s2, s5, 4 +; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[8:9] +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX90A-NEXT: s_lshl_b32 s0, s5, 4 ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f32 v2, v1 ; GFX90A-NEXT: .LBB29_4: ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] @@ -8152,20 +8153,20 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX90A-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s10 -; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b64 s[0:1], exec ; GFX90A-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX90A-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: .LBB29_5: ; %ComputeLoop ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX90A-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v1 ; GFX90A-NEXT: v_readlane_b32 s9, v2, s5 ; GFX90A-NEXT: s_mov_b32 m0, s5 -; GFX90A-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX90A-NEXT: v_writelane_b32 v0, s8, m0 -; GFX90A-NEXT: 
s_cmp_lg_u64 s[2:3], 0 +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX90A-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd @@ -8173,15 +8174,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX90A-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX90A-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX90A-NEXT: s_cbranch_execz .LBB29_8 ; GFX90A-NEXT: ; %bb.7: ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX90A-NEXT: .LBB29_8: -; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s2, v2 ; GFX90A-NEXT: v_add_f32_e32 v0, s2, v0 @@ -8193,10 +8194,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX908-LABEL: local_ds_fadd_one_as: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 -; GFX908-NEXT: s_mov_b64 s[2:3], exec -; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX908-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; GFX908-NEXT: s_mov_b64 s[0:1], exec +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_add_i32 s5, s5, 4 @@ -8204,9 +8205,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX908-NEXT: s_cbranch_execz .LBB29_2 ; GFX908-NEXT: ; %bb.1: -; GFX908-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX908-NEXT: 
s_bcnt1_i32_b64 s0, s[0:1] ; GFX908-NEXT: s_lshl_b32 s8, s5, 3 -; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GFX908-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, s8 ; GFX908-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -8217,15 +8218,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: v_readfirstlane_b32 s10, v1 ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX908-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] +; GFX908-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 +; GFX908-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX908-NEXT: s_cbranch_execz .LBB29_4 ; GFX908-NEXT: ; %bb.3: -; GFX908-NEXT: s_bcnt1_i32_b64 s2, s[8:9] -; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 -; GFX908-NEXT: s_lshl_b32 s2, s5, 4 +; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[8:9] +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX908-NEXT: s_lshl_b32 s0, s5, 4 ; GFX908-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 -; GFX908-NEXT: v_mov_b32_e32 v2, s2 +; GFX908-NEXT: v_mov_b32_e32 v2, s0 ; GFX908-NEXT: ds_add_f32 v2, v1 ; GFX908-NEXT: .LBB29_4: ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] @@ -8233,20 +8234,20 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX908-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX908-NEXT: v_mov_b32_e32 v1, s10 -; GFX908-NEXT: s_mov_b64 s[2:3], exec +; GFX908-NEXT: s_mov_b64 s[0:1], exec ; GFX908-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX908-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX908-NEXT: ; implicit-def: $vgpr0 ; GFX908-NEXT: .LBB29_5: ; %ComputeLoop ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX908-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX908-NEXT: v_readfirstlane_b32 s8, v1 ; GFX908-NEXT: v_readlane_b32 s9, v2, s5 ; GFX908-NEXT: s_mov_b32 m0, 
s5 -; GFX908-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX908-NEXT: v_writelane_b32 v0, s8, m0 -; GFX908-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX908-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX908-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX908-NEXT: ; %bb.6: ; %ComputeEnd @@ -8254,15 +8255,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX908-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX908-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX908-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX908-NEXT: s_cbranch_execz .LBB29_8 ; GFX908-NEXT: ; %bb.7: ; GFX908-NEXT: v_mov_b32_e32 v2, s4 ; GFX908-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX908-NEXT: .LBB29_8: -; GFX908-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX908-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_readfirstlane_b32 s2, v2 ; GFX908-NEXT: v_add_f32_e32 v0, s2, v0 @@ -8274,10 +8275,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: local_ds_fadd_one_as: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_i32 s5, s5, 4 @@ -8286,9 +8287,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: s_and_saveexec_b64 s[6:7], 
vcc ; GFX8-NEXT: s_cbranch_execz .LBB29_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX8-NEXT: s_lshl_b32 s8, s5, 3 -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -8299,15 +8300,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: v_readfirstlane_b32 s10, v1 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 -; GFX8-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB29_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[8:9] -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 -; GFX8-NEXT: s_lshl_b32 s2, s5, 4 +; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[8:9] +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX8-NEXT: s_lshl_b32 s0, s5, 4 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: ds_add_f32 v2, v1 ; GFX8-NEXT: .LBB29_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] @@ -8315,20 +8316,20 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s10 -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: .LBB29_5: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_readfirstlane_b32 s8, v1 ; GFX8-NEXT: v_readlane_b32 s9, 
v2, s5 ; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX8-NEXT: v_writelane_b32 v0, s8, m0 -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX8-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX8-NEXT: ; %bb.6: ; %ComputeEnd @@ -8336,16 +8337,16 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr2 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB29_8 ; GFX8-NEXT: ; %bb.7: ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX8-NEXT: .LBB29_8: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v2 ; GFX8-NEXT: v_add_f32_e32 v0, s2, v0 @@ -8358,10 +8359,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX7-LABEL: local_ds_fadd_one_as: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; GFX7-NEXT: s_mov_b64 s[2:3], exec -; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2 +; GFX7-NEXT: s_mov_b64 s[0:1], exec +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_i32 s5, s5, 4 @@ -8373,8 +8374,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: 
s_lshl_b32 s8, s5, 3 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: ds_read_b32 v1, v2 -; GFX7-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 +; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 ; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB29_2: ; %atomicrmw.start @@ -8384,8 +8385,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_add_f32_e32 v1, v4, v3 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, v4 -; GFX7-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 +; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_2 ; GFX7-NEXT: ; %bb.3: ; %Flow18 @@ -8396,15 +8397,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_readfirstlane_b32 s10, v1 ; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s8, 0 ; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s9, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 -; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX7-NEXT: s_cbranch_execz .LBB29_7 ; GFX7-NEXT: ; %bb.5: -; GFX7-NEXT: s_lshl_b32 s2, s5, 4 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: s_lshl_b32 s0, s5, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: ds_read_b32 v3, v1 -; GFX7-NEXT: s_bcnt1_i32_b64 s2, s[8:9] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 +; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[8:9] +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 ; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB29_6: ; %atomicrmw.start2 @@ -8413,8 +8414,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 ; GFX7-NEXT: 
ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], v4, v3 -; GFX7-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 +; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_6 @@ -8427,7 +8428,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_add_f32_e32 v2, s10, v2 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX7-NEXT: s_mov_b64 s[2:3], 0 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: .LBB29_8: ; %atomicrmw.start8 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -8436,12 +8437,12 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: ds_cmpst_rtn_b32 v0, v1, v3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7-NEXT: s_cbranch_execnz .LBB29_8 ; GFX7-NEXT: ; %bb.9: ; %atomicrmw.end7 -; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -8450,10 +8451,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX6-LABEL: local_ds_fadd_one_as: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; GFX6-NEXT: s_mov_b64 s[2:3], exec -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2 +; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: 
v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_add_i32 s5, s5, 4 @@ -8465,8 +8466,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_lshl_b32 s8, s5, 3 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ds_read_b32 v1, v2 -; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 +; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB29_2: ; %atomicrmw.start @@ -8476,8 +8477,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_add_f32_e32 v1, v4, v3 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, v4 -; GFX6-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 +; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_2 ; GFX6-NEXT: ; %bb.3: ; %Flow16 @@ -8488,15 +8489,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_readfirstlane_b32 s10, v1 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s8, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s9, v1 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 -; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB29_7 ; GFX6-NEXT: ; %bb.5: -; GFX6-NEXT: s_lshl_b32 s2, s5, 4 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_lshl_b32 s0, s5, 4 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_read_b32 v3, v1 -; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[8:9] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 +; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[8:9] +; GFX6-NEXT: 
v_cvt_f32_ubyte0_e32 v2, s0 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB29_6: ; %atomicrmw.start2 @@ -8505,8 +8506,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 ; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], v4, v3 -; GFX6-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 +; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_6 @@ -8519,7 +8520,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_add_f32_e32 v2, s10, v2 ; GFX6-NEXT: v_mov_b32_e32 v3, s10 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX6-NEXT: s_mov_b64 s[2:3], 0 +; GFX6-NEXT: s_mov_b64 s[0:1], 0 ; GFX6-NEXT: .LBB29_8: ; %atomicrmw.start8 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8528,12 +8529,12 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v3, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GFX6-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX6-NEXT: s_cbranch_execnz .LBB29_8 ; GFX6-NEXT: ; %bb.9: ; %atomicrmw.end7 -; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll index 8386a685a1a12..d068e2ae4ec97 
100644 --- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll @@ -14,7 +14,7 @@ define amdgpu_kernel void @local_memory(ptr addrspace(1) %out) #0 { ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 16, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_barrier ; GCN-NEXT: ds_read_b32 v0, v0 @@ -51,7 +51,7 @@ define amdgpu_kernel void @local_memory_two_objects(ptr addrspace(1) %out) #0 { ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_write2_b32 v1, v0, v2 offset1:4 ; SI-NEXT: v_sub_i32_e32 v0, vcc, 12, v1 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_barrier ; SI-NEXT: v_sub_i32_e32 v2, vcc, 28, v1 @@ -73,7 +73,7 @@ define amdgpu_kernel void @local_memory_two_objects(ptr addrspace(1) %out) #0 { ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_write2_b32 v1, v0, v2 offset1:4 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v1 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_barrier ; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:3 offset1:7 diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll index 52f97150e4b30..49531e3b4f8f3 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -19,20 +19,20 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; MUBUF-LABEL: local_stack_offset_uses_sp: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s9 +; MUBUF-NEXT: s_add_u32 s0, s0, s15 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x3000 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: v_add_u32_e32 
v0, 64, v1 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0 ; MUBUF-NEXT: v_mov_b32_e32 v3, 0x2000 -; MUBUF-NEXT: s_mov_b32 s6, 0 +; MUBUF-NEXT: s_mov_b32 s4, 0 ; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: .LBB0_1: ; %loadstoreloop ; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1 -; MUBUF-NEXT: v_add_u32_e32 v3, s6, v1 -; MUBUF-NEXT: s_add_i32 s6, s6, 1 -; MUBUF-NEXT: s_cmpk_lt_u32 s6, 0x2120 +; MUBUF-NEXT: v_add_u32_e32 v3, s4, v1 +; MUBUF-NEXT: s_add_i32 s4, s4, 1 +; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2120 ; MUBUF-NEXT: buffer_store_byte v2, v3, s[0:3], 0 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_cbranch_scc1 .LBB0_1 @@ -47,7 +47,7 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; MUBUF-NEXT: v_mov_b32_e32 v6, 0 ; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v2, v4 ; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v5, vcc @@ -58,30 +58,30 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; ; FLATSCR-LABEL: local_stack_offset_uses_sp: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 -; FLATSCR-NEXT: s_movk_i32 s2, 0x2000 -; FLATSCR-NEXT: scratch_store_dword off, v0, s2 +; FLATSCR-NEXT: s_movk_i32 s0, 0x2000 +; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_mov_b32 s2, 0 +; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: .LBB0_1: ; %loadstoreloop ; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 -; FLATSCR-NEXT: s_add_i32 s3, s2, 0x3000 -; FLATSCR-NEXT: 
s_add_i32 s2, s2, 1 -; FLATSCR-NEXT: s_cmpk_lt_u32 s2, 0x2120 -; FLATSCR-NEXT: scratch_store_byte off, v0, s3 +; FLATSCR-NEXT: s_add_i32 s1, s0, 0x3000 +; FLATSCR-NEXT: s_add_i32 s0, s0, 1 +; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120 +; FLATSCR-NEXT: scratch_store_byte off, v0, s1 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_1 ; FLATSCR-NEXT: ; %bb.2: ; %split -; FLATSCR-NEXT: s_movk_i32 s2, 0x2000 -; FLATSCR-NEXT: s_addk_i32 s2, 0x3000 -; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s2 offset:208 glc +; FLATSCR-NEXT: s_movk_i32 s0, 0x2000 +; FLATSCR-NEXT: s_addk_i32 s0, 0x3000 +; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:208 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_movk_i32 s2, 0x3000 -; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s2 offset:64 glc +; FLATSCR-NEXT: s_movk_i32 s0, 0x3000 +; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:64 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; FLATSCR-NEXT: v_mov_b32_e32 v4, 0 ; FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc @@ -201,19 +201,19 @@ entry: define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out) { ; MUBUF-LABEL: local_stack_offset_uses_sp_flat: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s9 +; MUBUF-NEXT: s_add_u32 s0, s0, s15 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0x4000 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0x2000 -; MUBUF-NEXT: s_mov_b32 s6, 0 +; MUBUF-NEXT: s_mov_b32 s4, 0 ; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: .LBB2_1: ; %loadstoreloop ; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1 -; MUBUF-NEXT: v_add_u32_e32 v2, s6, v0 -; MUBUF-NEXT: s_add_i32 s6, s6, 1 -; MUBUF-NEXT: s_cmpk_lt_u32 s6, 0x2120 +; MUBUF-NEXT: 
v_add_u32_e32 v2, s4, v0 +; MUBUF-NEXT: s_add_i32 s4, s4, 1 +; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2120 ; MUBUF-NEXT: buffer_store_byte v1, v2, s[0:3], 0 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_cbranch_scc1 .LBB2_1 @@ -251,7 +251,7 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out ; MUBUF-NEXT: v_mov_b32_e32 v12, 0x4000 ; MUBUF-NEXT: buffer_load_dword v3, v10, s[0:3], 0 offen offset:12 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; MUBUF-NEXT: buffer_load_dword v10, v11, s[0:3], 0 offen offset:16 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 @@ -272,33 +272,33 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out ; ; FLATSCR-LABEL: local_stack_offset_uses_sp_flat: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 -; FLATSCR-NEXT: s_mov_b32 s2, 0 -; FLATSCR-NEXT: scratch_store_dword off, v0, s2 offset:1024 +; FLATSCR-NEXT: s_mov_b32 s0, 0 +; FLATSCR-NEXT: scratch_store_dword off, v0, s0 offset:1024 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: .LBB2_1: ; %loadstoreloop ; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 -; FLATSCR-NEXT: s_add_i32 s3, s2, 0x2000 -; FLATSCR-NEXT: s_add_i32 s2, s2, 1 -; FLATSCR-NEXT: s_cmpk_lt_u32 s2, 0x2120 -; FLATSCR-NEXT: scratch_store_byte off, v0, s3 +; FLATSCR-NEXT: s_add_i32 s1, s0, 0x2000 +; FLATSCR-NEXT: s_add_i32 s0, s0, 1 +; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120 +; FLATSCR-NEXT: scratch_store_byte off, v0, s1 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_cbranch_scc1 .LBB2_1 ; FLATSCR-NEXT: ; %bb.2: ; %split -; FLATSCR-NEXT: s_movk_i32 s2, 0x1000 -; FLATSCR-NEXT: s_addk_i32 s2, 0x2000 -; 
FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s2 offset:720 glc +; FLATSCR-NEXT: s_movk_i32 s0, 0x1000 +; FLATSCR-NEXT: s_addk_i32 s0, 0x2000 +; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:720 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 offset:704 glc +; FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 offset:704 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_movk_i32 s2, 0x2000 -; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s2 offset:16 glc +; FLATSCR-NEXT: s_movk_i32 s0, 0x2000 +; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 offset:16 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_load_dwordx4 v[4:7], off, s2 glc +; FLATSCR-NEXT: scratch_load_dwordx4 v[4:7], off, s0 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; FLATSCR-NEXT: v_mov_b32_e32 v12, 0 ; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 ; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc diff --git a/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll index cc90d03e66715..7814eb603e554 100644 --- a/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll +++ b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll @@ -14,9 +14,9 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 { ; GCN-LABEL: uniform_conditional_max_short_forward_branch: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_load_dword s0, s[2:3], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s2, 0 +; GCN-NEXT: s_cmp_eq_u32 s0, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb2 ; GCN-NEXT: ;;#ASMSTART @@ -26,10 +26,10 @@ define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addr ; 
GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_sleep 0 ; GCN-NEXT: .LBB0_2: ; %bb3 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -55,9 +55,9 @@ bb3: define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 { ; GCN-LABEL: uniform_conditional_min_long_forward_branch: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_load_dword s0, s[2:3], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s2, 0 +; GCN-NEXT: s_cmp_eq_u32 s0, 0 ; GCN-NEXT: s_cbranch_scc0 .LBB1_1 ; GCN-NEXT: ; %bb.3: ; %bb0 ; GCN-NEXT: s_getpc_b64 s[8:9] @@ -73,10 +73,10 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrs ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB1_2: ; %bb3 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -102,9 +102,9 @@ bb3: define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr addrspace(1) %arg, float %cnd) #0 { ; GCN-LABEL: uniform_conditional_min_long_forward_vcnd_branch: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_load_dword s0, s[2:3], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_f32_e64 s[4:5], s2, 0 +; GCN-NEXT: v_cmp_eq_f32_e64 s[4:5], s0, 0 ; GCN-NEXT: s_and_b64 vcc, exec, s[4:5] ; GCN-NEXT: s_cbranch_vccz .LBB2_1 ; GCN-NEXT: ; %bb.3: ; %bb0 @@ -122,10 +122,10 @@ define amdgpu_kernel void 
@uniform_conditional_min_long_forward_vcnd_branch(ptr ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB2_2: ; %bb3 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -150,7 +150,7 @@ bb3: define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: min_long_forward_vbranch: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -254,28 +254,28 @@ bb3: define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) { ; GCN-LABEL: uniform_unconditional_min_long_forward_branch: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_load_dword s0, s[2:3], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s2, 0 -; GCN-NEXT: s_mov_b64 s[2:3], -1 +; GCN-NEXT: s_cmp_eq_u32 s0, 0 +; GCN-NEXT: s_mov_b64 s[0:1], -1 ; GCN-NEXT: s_cbranch_scc0 .LBB5_1 ; GCN-NEXT: ; %bb.7: ; %bb0 -; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_getpc_b64 s[8:9] ; GCN-NEXT: .Lpost_getpc5: -; GCN-NEXT: s_add_u32 s4, s4, (.LBB5_4-.Lpost_getpc5)&4294967295 -; GCN-NEXT: s_addc_u32 s5, s5, (.LBB5_4-.Lpost_getpc5)>>32 -; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s8, s8, (.LBB5_4-.Lpost_getpc5)&4294967295 +; GCN-NEXT: s_addc_u32 s9, s9, (.LBB5_4-.Lpost_getpc5)>>32 +; GCN-NEXT: s_setpc_b64 s[8:9] ; GCN-NEXT: .LBB5_1: ; %Flow -; GCN-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GCN-NEXT: s_cbranch_vccnz .LBB5_3 ; GCN-NEXT: .LBB5_2: ; %bb2 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: 
s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v0, 17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB5_3: ; %bb4 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt expcnt(0) @@ -294,17 +294,17 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add ; GCN-NEXT: s_mov_b64 vcc, exec ; GCN-NEXT: s_cbranch_execnz .LBB5_5 ; GCN-NEXT: ; %bb.9: ; %bb3 -; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_getpc_b64 s[8:9] ; GCN-NEXT: .Lpost_getpc6: -; GCN-NEXT: s_add_u32 s4, s4, (.LBB5_2-.Lpost_getpc6)&4294967295 -; GCN-NEXT: s_addc_u32 s5, s5, (.LBB5_2-.Lpost_getpc6)>>32 -; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s8, s8, (.LBB5_2-.Lpost_getpc6)&4294967295 +; GCN-NEXT: s_addc_u32 s9, s9, (.LBB5_2-.Lpost_getpc6)>>32 +; GCN-NEXT: s_setpc_b64 s[8:9] ; GCN-NEXT: .LBB5_5: ; %bb3 -; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_getpc_b64 s[8:9] ; GCN-NEXT: .Lpost_getpc4: -; GCN-NEXT: s_add_u32 s4, s4, (.LBB5_3-.Lpost_getpc4)&4294967295 -; GCN-NEXT: s_addc_u32 s5, s5, (.LBB5_3-.Lpost_getpc4)>>32 -; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s8, s8, (.LBB5_3-.Lpost_getpc4)&4294967295 +; GCN-NEXT: s_addc_u32 s9, s9, (.LBB5_3-.Lpost_getpc4)>>32 +; GCN-NEXT: s_setpc_b64 s[8:9] bb0: %tmp = icmp ne i32 %arg1, 0 br i1 %tmp, label %bb2, label %bb3 diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll index f19eeee1ca741..390d1d70ff2aa 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll @@ -4,12 +4,12 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) { ; GCN-LABEL: copy_flat: ; GCN: ; %bb.0: ; %entry -; 
GCN-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GCN-NEXT: s_load_b32 s4, s[2:3], 0x34 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_cmp_eq_u32 s4, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_3 ; GCN-NEXT: ; %bb.1: ; %for.body.preheader -; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 ; GCN-NEXT: .LBB0_2: ; %for.body @@ -50,12 +50,12 @@ for.end: ; preds = %for.body, %entry define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrspace(1) nocapture readonly %s, i32 %n) { ; GCN-LABEL: copy_global: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GCN-NEXT: s_load_b32 s4, s[2:3], 0x34 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_cmp_eq_u32 s4, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB1_3 ; GCN-NEXT: ; %bb.1: ; %for.body.preheader -; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 @@ -96,12 +96,12 @@ for.end: ; preds = %for.body, %entry define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addrspace(4) nocapture readonly %s, i32 %n) { ; GCN-LABEL: copy_constant: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GCN-NEXT: s_load_b32 s4, s[2:3], 0x34 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_cmp_eq_u32 s4, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB2_3 ; GCN-NEXT: ; %bb.1: ; %for.body.preheader -; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: .LBB2_2: ; %for.body ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -143,7 +143,7 @@ for.end: ; preds = %for.body, %entry define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspace(3) nocapture readonly %s, i32 %n) { ; GCN-LABEL: copy_local: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; 
GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_cmp_eq_u32 s2, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB3_2 diff --git a/llvm/test/CodeGen/AMDGPU/loop_break.ll b/llvm/test/CodeGen/AMDGPU/loop_break.ll index df3b2135e72ac..5484ba1ed2fe0 100644 --- a/llvm/test/CodeGen/AMDGPU/loop_break.ll +++ b/llvm/test/CodeGen/AMDGPU/loop_break.ll @@ -32,7 +32,7 @@ define amdgpu_kernel void @break_loop(i32 %arg) #0 { ; ; GCN-LABEL: break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s3, s[0:1], 0x9 +; GCN-NEXT: s_load_dword s3, s[2:3], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -114,7 +114,7 @@ define amdgpu_kernel void @undef_phi_cond_break_loop(i32 %arg) #0 { ; ; GCN-LABEL: undef_phi_cond_break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s3, s[0:1], 0x9 +; GCN-NEXT: s_load_dword s3, s[2:3], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -207,7 +207,7 @@ define amdgpu_kernel void @constexpr_phi_cond_break_loop(i32 %arg) #0 { ; ; GCN-LABEL: constexpr_phi_cond_break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s3, s[0:1], 0x9 +; GCN-NEXT: s_load_dword s3, s[2:3], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -297,7 +297,7 @@ define amdgpu_kernel void @true_phi_cond_break_loop(i32 %arg) #0 { ; ; GCN-LABEL: true_phi_cond_break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s3, s[0:1], 0x9 +; GCN-NEXT: s_load_dword s3, s[2:3], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -386,7 +386,7 @@ define amdgpu_kernel void @false_phi_cond_break_loop(i32 %arg) #0 { ; ; GCN-LABEL: false_phi_cond_break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s3, s[0:1], 0x9 +; GCN-NEXT: s_load_dword s3, s[2:3], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -479,7 
+479,7 @@ define amdgpu_kernel void @invert_true_phi_cond_break_loop(i32 %arg) #0 { ; ; GCN-LABEL: invert_true_phi_cond_break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s3, s[0:1], 0x9 +; GCN-NEXT: s_load_dword s3, s[2:3], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll index cb3ea2e812770..7998d430d5f90 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll @@ -24,7 +24,7 @@ define protected amdgpu_kernel void @test(ptr addrspace(1) nocapture %ptr.coerce ; GCN-NEXT: ds_write_b8 v0, v1 ; GCN-NEXT: ds_read_u8 v2, v0 offset:2 ; GCN-NEXT: ds_read_u16 v3, v0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b8 v0, v2 offset:6 ; GCN-NEXT: ds_write_b16 v0, v3 offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll index c6a734a065ff1..00dcff093c7db 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll @@ -26,18 +26,18 @@ define amdgpu_kernel void @no_clobber_ds_load_stores_x2(ptr addrspace(1) %arg, i ; ; GCN-LABEL: no_clobber_ds_load_stores_x2: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-NEXT: s_load_dword s0, s[2:3], 0x2c ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 2 ; GCN-NEXT: ds_write_b32 v1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s2, s2, 2 -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_lshl_b32 s0, s0, 2 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_write_b32 v1, v2 offset:256 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:256 -; GCN-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v0, v2, v0 ; GCN-NEXT: global_store_dword v1, v0, s[0:1] @@ -74,21 +74,21 @@ define amdgpu_kernel void @no_clobber_ds_load_stores_x3(ptr addrspace(1) %arg, i ; ; GCN-LABEL: no_clobber_ds_load_stores_x3: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-NEXT: s_load_dword s0, s[2:3], 0x2c ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 2 ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: ds_write_b32 v1, v2 offset:256 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s2, s2, 2 +; GCN-NEXT: s_lshl_b32 s0, s0, 2 ; GCN-NEXT: v_mov_b32_e32 v2, 3 ; GCN-NEXT: ds_write_b32 v1, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_write_b32 v1, v2 offset:512 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v3, v0 offset:256 ; GCN-NEXT: ds_read_b32 v0, v0 offset:512 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v2, v2, v3 ; GCN-NEXT: v_add_u32_e32 v0, v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll index 00d01a080ad14..9bbcc6988e311 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll @@ -161,24 +161,29 @@ define amdgpu_kernel void @k01() { ; GCN-LABEL: k01: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 -; GCN-NEXT: s_add_i32 s6, s6, s9 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_add_u32 s0, s0, s15 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] -; GCN-NEXT: s_getpc_b64 
s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, f0@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, f0@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] +; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: s_getpc_b64 s[6:7] +; GCN-NEXT: s_add_u32 s6, s6, f0@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s7, s7, f0@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v31, v0, v2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, f1@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_getpc_b64 s[6:7] +; GCN-NEXT: s_add_u32 s6, s6, f1@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s7, s7, f1@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: s_endpgm call void @f0() call void @f1() @@ -195,28 +200,36 @@ define amdgpu_kernel void @k23() { ; GCN-LABEL: k23: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 -; GCN-NEXT: s_add_i32 s6, s6, s9 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_add_u32 s0, s0, s15 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] +; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] +; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: s_mov_b64 s[16:17], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f2@gotpcrel32@hi+12 -; 
GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v31, v0, v2 ; GCN-NEXT: s_mov_b32 s15, 1 - +; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f3@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f3@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 - +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: s_endpgm + + call void @f2() call void @f3() ret void @@ -237,30 +250,35 @@ define amdgpu_kernel void @k123() { ; GCN-LABEL: k123: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 -; GCN-NEXT: s_add_i32 s6, s6, s9 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_add_u32 s0, s0, s15 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, f1@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] +; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: s_getpc_b64 s[6:7] +; GCN-NEXT: s_add_u32 s6, s6, f1@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s7, s7, f1@gotpcrel32@hi+12 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 
v31, v0, v2 +; GCN-NEXT: s_mov_b32 s15, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_read_u8 v1, v0 offset:16 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, f2@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_getpc_b64 s[6:7] +; GCN-NEXT: s_add_u32 s6, s6, f2@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s7, s7, f2@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GCN-NEXT: ds_write_b8 v0, v1 offset:16 -; GCN-NEXT: s_mov_b32 s15, 0 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: s_endpgm call void @f1() %ld = load i8, ptr addrspace(3) @v3 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll index d3cc60c501fd7..72a0aceaae12b 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll @@ -226,29 +226,37 @@ define amdgpu_kernel void @k01() { ; GCN-LABEL: k01: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 -; GCN-NEXT: s_add_i32 s6, s6, s9 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_add_u32 s0, s0, s15 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] +; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] +; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: s_mov_b64 s[16:17], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f0@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f0@gotpcrel32@hi+12 -; GCN-NEXT: 
s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v31, v0, v2 ; GCN-NEXT: s_mov_b32 s15, 0 - +; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f1@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 - +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: s_endpgm + + call void @f0() call void @f1() ret void @@ -265,28 +273,36 @@ define amdgpu_kernel void @k23() { ; GCN-LABEL: k23: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 -; GCN-NEXT: s_add_i32 s6, s6, s9 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_add_u32 s0, s0, s15 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] +; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] +; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: s_mov_b64 s[16:17], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f2@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v31, v0, v2 ; GCN-NEXT: s_mov_b32 s15, 2 - +; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] ; GCN-NEXT: s_waitcnt 
lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f3@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f3@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 - +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: s_endpgm + + call void @f2() call void @f3() ret void @@ -307,33 +323,41 @@ define amdgpu_kernel void @k123() { ; GCN-LABEL: k123: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 -; GCN-NEXT: s_add_i32 s6, s6, s9 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_add_u32 s0, s0, s15 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] +; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] +; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: s_mov_b64 s[16:17], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f1@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v31, v0, v2 ; GCN-NEXT: s_mov_b32 s15, 1 - +; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_read_u8 v1, v0 offset:2 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f2@gotpcrel32@hi+12 -; 
GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GCN-NEXT: ds_write_b8 v0, v1 offset:2 - -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] +; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: s_endpgm + + call void @f1() %ld = load i8, ptr addrspace(3) @v3 %mul = mul i8 %ld, 8 diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll index 1429251fc6421..fef1b57db5685 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll @@ -1,19 +1,30 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-SDAG %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-GISEL %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc 
-mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s define amdgpu_kernel void @workgroup_ids_kernel() { -; GFX9-LABEL: workgroup_ids_kernel: -; GFX9: ; %bb.0: ; %.entry -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: workgroup_ids_kernel: +; GFX9-SDAG: ; %bb.0: ; %.entry +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: workgroup_ids_kernel: +; GFX9-GISEL: ; %bb.0: ; %.entry +; GFX9-GISEL-NEXT: s_mov_b32 s0, s6 +; GFX9-GISEL-NEXT: s_mov_b32 s1, s7 +; GFX9-GISEL-NEXT: s_mov_b32 s2, s8 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9-GISEL-NEXT: s_endpgm ; ; GFX9ARCH-SDAG-LABEL: workgroup_ids_kernel: ; GFX9ARCH-SDAG: ; %bb.0: ; %.entry @@ -72,20 +83,27 @@ define amdgpu_kernel void @workgroup_ids_kernel() { define amdgpu_kernel void @caller() { ; GFX9-SDAG-LABEL: caller: ; 
GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_add_u32 flat_scratch_lo, s10, s13 -; GFX9-SDAG-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s13 -; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-SDAG-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX9-SDAG-NEXT: s_getpc_b64 s[8:9] -; GFX9-SDAG-NEXT: s_add_u32 s8, s8, callee@gotpcrel32@lo+4 -; GFX9-SDAG-NEXT: s_addc_u32 s9, s9, callee@gotpcrel32@hi+12 -; GFX9-SDAG-NEXT: s_load_dwordx2 s[14:15], s[8:9], 0x0 +; GFX9-SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-SDAG-NEXT: s_mov_b32 s38, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-SDAG-NEXT: s_add_u32 s36, s36, s9 +; GFX9-SDAG-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-SDAG-NEXT: s_add_u32 s8, s2, 36 +; GFX9-SDAG-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-SDAG-NEXT: s_getpc_b64 s[2:3] +; GFX9-SDAG-NEXT: s_add_u32 s2, s2, callee@gotpcrel32@lo+4 +; GFX9-SDAG-NEXT: s_addc_u32 s3, s3, callee@gotpcrel32@hi+12 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x0 +; GFX9-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-SDAG-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-SDAG-NEXT: s_mov_b32 s12, s6 +; GFX9-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-SDAG-NEXT: s_mov_b32 s32, 0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: s_swappc_b64 s[30:31], s[14:15] @@ -93,20 +111,27 @@ define amdgpu_kernel void @caller() { ; ; GFX9-GISEL-LABEL: caller: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_add_u32 flat_scratch_lo, s10, s13 -; GFX9-GISEL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX9-GISEL-NEXT: s_add_u32 s0, s0, s13 -; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] 
-; GFX9-GISEL-NEXT: s_getpc_b64 s[8:9] -; GFX9-GISEL-NEXT: s_add_u32 s8, s8, callee@gotpcrel32@lo+4 -; GFX9-GISEL-NEXT: s_addc_u32 s9, s9, callee@gotpcrel32@hi+12 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[14:15], s[8:9], 0x0 +; GFX9-GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-GISEL-NEXT: s_mov_b32 s38, -1 +; GFX9-GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-GISEL-NEXT: s_add_u32 s36, s36, s9 +; GFX9-GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-GISEL-NEXT: s_add_u32 s8, s2, 36 +; GFX9-GISEL-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-GISEL-NEXT: s_getpc_b64 s[0:1] +; GFX9-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 +; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x0 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-GISEL-NEXT: s_mov_b32 s12, s6 ; GFX9-GISEL-NEXT: s_mov_b32 s32, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_swappc_b64 s[30:31], s[14:15] @@ -114,61 +139,81 @@ define amdgpu_kernel void @caller() { ; ; GFX9ARCH-SDAG-LABEL: caller: ; GFX9ARCH-SDAG: ; %bb.0: -; GFX9ARCH-SDAG-NEXT: s_add_u32 flat_scratch_lo, s10, s12 -; GFX9ARCH-SDAG-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX9ARCH-SDAG-NEXT: s_add_u32 s0, s0, s12 -; GFX9ARCH-SDAG-NEXT: s_addc_u32 s1, s1, 0 -; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX9ARCH-SDAG-NEXT: s_getpc_b64 s[8:9] -; GFX9ARCH-SDAG-NEXT: s_add_u32 s8, s8, callee@gotpcrel32@lo+4 -; GFX9ARCH-SDAG-NEXT: s_addc_u32 s9, s9, callee@gotpcrel32@hi+12 -; GFX9ARCH-SDAG-NEXT: 
s_load_dwordx2 s[12:13], s[8:9], 0x0 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s38, -1 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9ARCH-SDAG-NEXT: s_add_u32 s36, s36, s6 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s37, s37, 0 +; GFX9ARCH-SDAG-NEXT: s_add_u32 s8, s2, 36 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s9, s3, 0 +; GFX9ARCH-SDAG-NEXT: s_getpc_b64 s[2:3] +; GFX9ARCH-SDAG-NEXT: s_add_u32 s2, s2, callee@gotpcrel32@lo+4 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s3, s3, callee@gotpcrel32@hi+12 +; GFX9ARCH-SDAG-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9ARCH-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 ; GFX9ARCH-SDAG-NEXT: s_mov_b32 s32, 0 ; GFX9ARCH-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9ARCH-SDAG-NEXT: s_swappc_b64 s[30:31], s[12:13] +; GFX9ARCH-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9ARCH-SDAG-NEXT: s_endpgm ; ; GFX9ARCH-GISEL-LABEL: caller: ; GFX9ARCH-GISEL: ; %bb.0: -; GFX9ARCH-GISEL-NEXT: s_add_u32 flat_scratch_lo, s10, s12 -; GFX9ARCH-GISEL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX9ARCH-GISEL-NEXT: s_add_u32 s0, s0, s12 -; GFX9ARCH-GISEL-NEXT: s_addc_u32 s1, s1, 0 -; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX9ARCH-GISEL-NEXT: s_getpc_b64 s[8:9] -; GFX9ARCH-GISEL-NEXT: s_add_u32 s8, s8, callee@gotpcrel32@lo+4 -; GFX9ARCH-GISEL-NEXT: s_addc_u32 s9, s9, callee@gotpcrel32@hi+12 -; GFX9ARCH-GISEL-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; 
GFX9ARCH-GISEL-NEXT: s_mov_b32 s38, -1 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9ARCH-GISEL-NEXT: s_add_u32 s36, s36, s6 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GFX9ARCH-GISEL-NEXT: s_add_u32 s8, s2, 36 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s9, s3, 0 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9ARCH-GISEL-NEXT: s_getpc_b64 s[0:1] +; GFX9ARCH-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 +; GFX9ARCH-GISEL-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9ARCH-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9ARCH-GISEL-NEXT: s_mov_b32 s32, 0 ; GFX9ARCH-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9ARCH-GISEL-NEXT: s_swappc_b64 s[30:31], s[12:13] +; GFX9ARCH-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9ARCH-GISEL-NEXT: s_endpgm ; -; GFX12-LABEL: caller: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX12-NEXT: s_getpc_b64 s[4:5] -; GFX12-NEXT: s_sext_i32_i16 s5, s5 -; GFX12-NEXT: s_add_co_u32 s4, s4, callee@gotpcrel32@lo+8 -; GFX12-NEXT: s_add_co_ci_u32 s5, s5, callee@gotpcrel32@hi+16 -; GFX12-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX12-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX12-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX12-NEXT: s_mov_b32 s32, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX12-NEXT: s_endpgm +; GFX12-SDAG-LABEL: caller: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 +; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX12-SDAG-NEXT: s_mov_b32 s7, 
callee@abs32@hi +; GFX12-SDAG-NEXT: s_mov_b32 s6, callee@abs32@lo +; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX12-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: caller: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 +; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX12-GISEL-NEXT: s_mov_b32 s6, callee@abs32@lo +; GFX12-GISEL-NEXT: s_mov_b32 s7, callee@abs32@hi +; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX12-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX12-GISEL-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() call void @callee(i32 %idx) #0 ret void diff --git a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll index 7830bfc6ac7f5..2963e7b765a0d 100644 --- a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll +++ b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll @@ -4,8 +4,8 @@ define amdgpu_kernel void @zext_shl64_to_32(ptr addrspace(1) nocapture %out, i32 %x) { ; GCN-LABEL: zext_shl64_to_32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -24,8 +24,8 @@ define amdgpu_kernel void @zext_shl64_to_32(ptr addrspace(1) nocapture %out, i32 define amdgpu_kernel void @sext_shl64_to_32(ptr addrspace(1) nocapture %out, i32 %x) { ; GCN-LABEL: sext_shl64_to_32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: 
v_mov_b32_e32 v1, 0 @@ -45,8 +45,8 @@ define amdgpu_kernel void @sext_shl64_to_32(ptr addrspace(1) nocapture %out, i32 define amdgpu_kernel void @zext_shl64_overflow(ptr addrspace(1) nocapture %out, i32 %x) { ; GCN-LABEL: zext_shl64_overflow: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 @@ -67,8 +67,8 @@ define amdgpu_kernel void @zext_shl64_overflow(ptr addrspace(1) nocapture %out, define amdgpu_kernel void @sext_shl64_overflow(ptr addrspace(1) nocapture %out, i32 %x) { ; GCN-LABEL: sext_shl64_overflow: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 @@ -89,7 +89,7 @@ define amdgpu_kernel void @sext_shl64_overflow(ptr addrspace(1) nocapture %out, define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: mulu24_shl64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: v_and_b32_e32 v0, 6, v0 ; GCN-NEXT: v_mul_u32_u24_e32 v0, 7, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -112,7 +112,7 @@ bb: define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr addrspace(1) nocapture readonly %arg1) { ; GCN-LABEL: muli24_shl64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll index 994ef22539a65..e8ac1b2887c36 100644 --- 
a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { ; GFX9-LABEL: s_lshr_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -18,7 +18,7 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; ; VI-LABEL: s_lshr_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s4, s2, 0xffff ; VI-NEXT: s_lshr_b32 s2, s2, 16 @@ -35,7 +35,7 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; ; CI-LABEL: s_lshr_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -54,7 +54,7 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; ; GFX10-LABEL: s_lshr_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_lshrrev_b16 v1, s3, s2 @@ -63,7 +63,7 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; ; GFX11-LABEL: s_lshr_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_lshrrev_b16 v1, s3, s2 @@ -79,7 +79,7 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr 
addrspace(1) %in) #0 { ; GFX9-LABEL: v_lshr_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -90,7 +90,7 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_lshr_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -109,7 +109,7 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: v_lshr_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -131,7 +131,7 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: v_lshr_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -142,7 +142,9 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_lshr_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -167,20 +169,20 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) 
%in, <2 x i16> %sgpr) #0 { ; GFX9-LABEL: lshr_v_s_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v1, s2, v1 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, s0, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_v_s_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -201,22 +203,22 @@ define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; CI-LABEL: lshr_v_s_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dword s8, s[0:1], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dword s0, s[2:3], 0xd +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: s_mov_b64 s[8:9], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_lshr_b32 s0, s8, 16 -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; CI-NEXT: s_lshr_b32 s1, s0, 16 +; CI-NEXT: s_mov_b64 s[6:7], s[10:11] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; CI-NEXT: v_lshrrev_b32_e32 v3, s0, v3 -; CI-NEXT: v_lshrrev_b32_e32 v2, s8, v2 +; CI-NEXT: 
v_lshrrev_b32_e32 v3, s1, v3 +; CI-NEXT: v_lshrrev_b32_e32 v2, s0, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -224,9 +226,10 @@ define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX10-LABEL: lshr_v_s_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -236,9 +239,12 @@ define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: lshr_v_s_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -260,20 +266,20 @@ define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 { ; GFX9-LABEL: lshr_s_v_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, s2 +; GFX9-NEXT: 
v_pk_lshrrev_b16 v1, v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_s_v_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -294,22 +300,22 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; CI-LABEL: lshr_s_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dword s8, s[0:1], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dword s0, s[2:3], 0xd +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: s_mov_b64 s[8:9], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_lshr_b32 s0, s8, 16 -; CI-NEXT: s_and_b32 s1, s8, 0xffff -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; CI-NEXT: s_lshr_b32 s1, s0, 16 +; CI-NEXT: s_and_b32 s0, s0, 0xffff +; CI-NEXT: s_mov_b64 s[6:7], s[10:11] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_lshr_b32_e32 v3, s0, v3 -; CI-NEXT: v_lshr_b32_e32 v2, s1, v2 +; CI-NEXT: v_lshr_b32_e32 v3, s1, v3 +; CI-NEXT: v_lshr_b32_e32 v2, s0, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -317,9 +323,10 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX10-LABEL: lshr_s_v_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -329,9 +336,12 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: lshr_s_v_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -353,7 +363,7 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: lshr_imm_v_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -364,7 +374,7 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: lshr_imm_v_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v4, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -384,7 +394,7 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; CI-LABEL: lshr_imm_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 
s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -404,7 +414,7 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX10-LABEL: lshr_imm_v_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -415,7 +425,9 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: lshr_imm_v_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -438,7 +450,7 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: lshr_v_imm_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -449,7 +461,7 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: lshr_v_imm_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -468,7 +480,7 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; CI-LABEL: lshr_v_imm_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -485,7 +497,7 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX10-LABEL: lshr_v_imm_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -496,7 +508,9 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: lshr_v_imm_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -519,7 +533,7 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_lshr_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -531,7 +545,7 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_lshr_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -553,7 +567,7 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: v_lshr_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: 
v_lshlrev_b32_e32 v4, 3, v0 @@ -582,7 +596,7 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: v_lshr_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -594,7 +608,9 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_lshr_v4i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -620,7 +636,7 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: lshr_v_imm_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -632,7 +648,7 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: lshr_v_imm_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -654,7 +670,7 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace ; ; CI-LABEL: lshr_v_imm_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: 
v_lshlrev_b32_e32 v0, 3, v0 @@ -673,7 +689,7 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX10-LABEL: lshr_v_imm_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -685,7 +701,9 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: lshr_v_imm_v4i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/mad.u16.ll b/llvm/test/CodeGen/AMDGPU/mad.u16.ll index 995c8c8679397..3032b1028dc2d 100644 --- a/llvm/test/CodeGen/AMDGPU/mad.u16.ll +++ b/llvm/test/CodeGen/AMDGPU/mad.u16.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @mad_u16( ; GFX8-LABEL: mad_u16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -35,7 +35,7 @@ define amdgpu_kernel void @mad_u16( ; ; GFX9-LABEL: mad_u16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -51,7 +51,7 @@ define amdgpu_kernel void @mad_u16( ; ; GFX10-LABEL: mad_u16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -67,8 +67,10 @@ define amdgpu_kernel void @mad_u16( ; ; GFX11-LABEL: mad_u16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll index 620566d3baff3..e876a8d9dda69 100644 --- a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll +++ b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll @@ -9,7 +9,7 @@ declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0 ; GCN-LABEL: {{^}}get_global_id_0: ; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff -; GCN: s_mul_i32 [[MUL:s[0-9]+]], s8, [[WGSIZEX]] +; GCN: s_mul_i32 [[MUL:s[0-9]+]], s10, [[WGSIZEX]] ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, [[MUL]], v0 define amdgpu_kernel void @get_global_id_0(ptr addrspace(1) %out) #1 { %dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index 400298bcff4f9..8eb0a46cc8b17 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -908,8 +908,8 @@ define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 { define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; CI-LABEL: mad_i64_i32_uniform: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; CI-NEXT: 
s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -924,33 +924,33 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0, ; ; SI-LABEL: mad_i64_i32_uniform: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: v_mul_hi_u32 v1, s6, v0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mul_i32 s4, s6, s7 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_add_i32_e32 v0, vcc, s8, v0 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mul_i32 s2, s6, s7 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v2, s1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; GFX9-LABEL: mad_i64_i32_uniform: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s0, s6, s7 -; GFX9-NEXT: s_mul_hi_u32 s1, s6, s7 -; GFX9-NEXT: s_add_u32 s0, s0, s2 -; GFX9-NEXT: s_addc_u32 s1, s1, s3 +; GFX9-NEXT: s_mul_i32 s3, s6, s7 +; GFX9-NEXT: s_mul_hi_u32 s2, s6, s7 +; GFX9-NEXT: s_add_u32 s0, s3, s0 +; GFX9-NEXT: s_addc_u32 s1, s2, s1 ; GFX9-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm @@ -958,8 +958,8 @@ define 
amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0, ; GFX11-LABEL: mad_i64_i32_uniform: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mul_i32 s2, s6, s7 ; GFX11-NEXT: s_mul_hi_u32 s3, s6, s7 @@ -975,8 +975,8 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0, ; GFX12-LABEL: mad_i64_i32_uniform: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_mov_b32 s3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s2, s6 diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll index 9ec37a5e14cdf..b8b4d4440d580 100644 --- a/llvm/test/CodeGen/AMDGPU/madak.ll +++ b/llvm/test/CodeGen/AMDGPU/madak.ll @@ -15,18 +15,18 @@ declare float @llvm.fabs.f32(float) nounwind readnone define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: madak_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7] +; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; 
GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 -; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX6-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_madak_f32 v2, v2, v3, 0x41200000 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -34,8 +34,8 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX8-LABEL: madak_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -56,12 +56,12 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX9-LABEL: madak_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_madak_f32 v1, v1, v2, 0x41200000 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -70,13 +70,13 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; GFX10-MAD-LABEL: madak_f32: ; GFX10-MAD: ; %bb.0: ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; 
GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: s_clause 0x1 ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_madak_f32 v1, v1, v2, 0x41200000 ; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] @@ -85,8 +85,10 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; GFX11-MAD-LABEL: madak_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: s_clause 0x1 @@ -103,12 +105,13 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX940-FMA-LABEL: madak_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3] +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[0:1] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000 ; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 @@ -117,13 +120,13 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; GFX10-FMA-LABEL: madak_f32: ; GFX10-FMA: ; %bb.0: ; 
GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: s_clause 0x1 ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000 ; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] @@ -132,8 +135,10 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; GFX11-FMA-LABEL: madak_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -165,7 +170,7 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { ; GFX6-LABEL: madak_2_use_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -190,7 +195,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX8-LABEL: madak_2_use_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: 
v_lshlrev_b32_e32 v6, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -220,7 +225,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX9-LABEL: madak_2_use_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -240,7 +245,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX10-MAD-LABEL: madak_2_use_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -259,7 +264,9 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX11-MAD-LABEL: madak_2_use_f32: ; GFX11-MAD: ; %bb.0: -; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -269,9 +276,9 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; GFX11-MAD-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 glc dlc ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e32 v2, v1, v2 -; GFX11-MAD-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-MAD-NEXT: v_dual_add_f32 v1, 0x41200000, v1 :: v_dual_add_f32 v2, 0x41200000, v2 +; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-MAD-NEXT: v_dual_mul_f32 v1, v1, v3 :: v_dual_add_f32 v2, 0x41200000, v2 +; 
GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 ; GFX11-MAD-NEXT: global_store_b32 v0, v2, s[0:1] dlc ; GFX11-MAD-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[2:3] offset:4 dlc @@ -282,7 +289,8 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX940-FMA-LABEL: madak_2_use_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: v_mov_b32_e32 v4, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) @@ -302,7 +310,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX10-FMA-LABEL: madak_2_use_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -321,7 +329,9 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX11-FMA-LABEL: madak_2_use_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -365,7 +375,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a) #0 { ; GFX6-LABEL: madak_m_inline_imm_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; 
GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -381,7 +391,7 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX8-LABEL: madak_m_inline_imm_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -398,7 +408,7 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX9-LABEL: madak_m_inline_imm_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -409,7 +419,7 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX10-MAD-LABEL: madak_m_inline_imm_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] @@ -420,13 +430,14 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX11-MAD-LABEL: madak_m_inline_imm_f32: ; GFX11-MAD: ; %bb.0: -; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e32 v1, 4.0, v1 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 ; GFX11-MAD-NEXT: global_store_b32 v0, 
v1, s[0:1] ; GFX11-MAD-NEXT: s_nop 0 @@ -435,7 +446,8 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX940-FMA-LABEL: madak_m_inline_imm_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] @@ -446,7 +458,7 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX10-FMA-LABEL: madak_m_inline_imm_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] @@ -457,7 +469,9 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX11-FMA-LABEL: madak_m_inline_imm_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] @@ -484,18 +498,18 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: madak_inline_imm_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; 
GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7] +; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 -; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX6-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mad_f32 v2, v2, v3, 4.0 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -503,8 +517,8 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; ; GFX8-LABEL: madak_inline_imm_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -525,12 +539,12 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; ; GFX9-LABEL: madak_inline_imm_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_f32 v1, v1, v2, 4.0 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -539,13 +553,13 @@ define amdgpu_kernel void 
@madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; GFX10-MAD-LABEL: madak_inline_imm_f32: ; GFX10-MAD: ; %bb.0: ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: s_clause 0x1 ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_mad_f32 v1, v1, v2, 4.0 ; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] @@ -554,8 +568,10 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; GFX11-MAD-LABEL: madak_inline_imm_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: s_clause 0x1 @@ -572,12 +588,13 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; ; GFX940-FMA-LABEL: madak_inline_imm_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX940-FMA-NEXT: 
global_load_dword v2, v0, s[2:3] +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[0:1] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fma_f32 v1, v1, v2, 4.0 ; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 @@ -586,13 +603,13 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; GFX10-FMA-LABEL: madak_inline_imm_f32: ; GFX10-FMA: ; %bb.0: ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: s_clause 0x1 ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fma_f32 v1, v1, v2, 4.0 ; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] @@ -601,8 +618,10 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; GFX11-FMA-LABEL: madak_inline_imm_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -632,26 +651,26 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, float %b) #0 { ; GFX6-LABEL: s_v_madak_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: 
s_load_dword s8, s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s0, s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7] +; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX6-NEXT: v_mov_b32_e32 v3, 0x41200000 -; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mac_f32_e32 v3, s8, v2 +; GFX6-NEXT: v_mac_f32_e32 v3, s0, v2 ; GFX6-NEXT: buffer_store_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_v_madak_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -669,22 +688,23 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr ; ; GFX9-LABEL: s_v_madak_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mac_f32_e32 v2, s2, v1 +; GFX9-NEXT: v_mac_f32_e32 v2, s0, v1 ; GFX9-NEXT: global_store_dword v0, v2, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: s_v_madak_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_clause 0x1 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-MAD-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) @@ -695,14 +715,15 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr ; GFX11-MAD-LABEL: s_v_madak_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-MAD-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-MAD-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s0, v1 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-MAD-NEXT: s_nop 0 @@ -711,22 +732,24 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr ; ; GFX940-FMA-LABEL: s_v_madak_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-FMA-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-FMA-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: v_fmac_f32_e32 v2, s2, v1 +; 
GFX940-FMA-NEXT: v_fmac_f32_e32 v2, s0, v1 ; GFX940-FMA-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: s_v_madak_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_clause 0x1 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-FMA-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) @@ -737,8 +760,10 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr ; GFX11-FMA-LABEL: s_v_madak_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] @@ -763,82 +788,84 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: v_s_madak_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 
0x9 ; GFX6-NEXT: v_mov_b32_e32 v3, 0x41200000 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mac_f32_e32 v3, s2, v2 +; GFX6-NEXT: v_mac_f32_e32 v3, s0, v2 ; GFX6-NEXT: buffer_store_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: v_s_madak_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x2c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mac_f32_e32 v2, s0, v3 +; GFX8-NEXT: v_mac_f32_e32 v2, s2, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: v_s_madak_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mac_f32_e32 v2, s4, v1 -; 
GFX9-NEXT: global_store_dword v0, v2, s[2:3] +; GFX9-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: v_s_madak_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-MAD-NEXT: global_load_dword v1, v0, s[0:1] ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-MAD-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX10-MAD-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-MAD-NEXT: v_madak_f32 v1, s4, v1, 0x41200000 -; GFX10-MAD-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-MAD-NEXT: s_endpgm ; ; GFX11-MAD-LABEL: v_s_madak_f32: ; GFX11-MAD: ; %bb.0: -; GFX11-MAD-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-MAD-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s4, v1 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1] @@ -848,44 +875,47 @@ define amdgpu_kernel void 
@v_s_madak_f32(ptr addrspace(1) noalias %out, float %a ; ; GFX940-FMA-LABEL: v_s_madak_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] -; GFX940-FMA-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-FMA-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-FMA-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-FMA-NEXT: v_fmac_f32_e32 v2, s4, v1 -; GFX940-FMA-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1 +; GFX940-FMA-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: v_s_madak_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-FMA-NEXT: global_load_dword v1, v0, s[0:1] ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-FMA-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX10-FMA-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-FMA-NEXT: v_fmaak_f32 v1, s4, v1, 0x41200000 -; GFX10-FMA-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-FMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: v_s_madak_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x34 +; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FMA-NEXT: v_fmaak_f32 v1, s2, v1, 0x41200000 +; GFX11-FMA-NEXT: v_fmaak_f32 v1, s4, v1, 0x41200000 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-FMA-NEXT: s_nop 0 ; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -905,7 +935,7 @@ define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX6-LABEL: s_s_madak_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x41200000 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -919,7 +949,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX8-LABEL: s_s_madak_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s3 @@ -931,7 +961,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX9-LABEL: s_s_madak_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -942,7 +972,7 @@ define 
amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX10-MAD-LABEL: s_s_madak_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-MAD-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: v_mov_b32_e32 v0, s3 @@ -952,7 +982,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX11-MAD-LABEL: s_s_madak_f32: ; GFX11-MAD: ; %bb.0: -; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e64 v0, s2, s3 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -964,7 +994,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX940-FMA-LABEL: s_s_madak_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX940-FMA-NEXT: v_mov_b32_e32 v1, 0x41200000 ; GFX940-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) @@ -975,7 +1005,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX10-FMA-LABEL: s_s_madak_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-FMA-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: v_mov_b32_e32 v0, s3 @@ -985,7 +1015,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX11-FMA-LABEL: s_s_madak_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1003,19 +1033,19 @@ define amdgpu_kernel 
void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: no_madak_src0_modifier_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7] +; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_mov_b32 s0, 0x41200000 -; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mad_f32 v2, |v2|, v3, s0 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -1023,8 +1053,8 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; ; GFX8-LABEL: no_madak_src0_modifier_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -1046,13 +1076,13 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; ; GFX9-LABEL: no_madak_src0_modifier_f32: ; GFX9: ; 
%bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s0, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: s_mov_b32 s0, 0x41200000 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_f32 v1, |v1|, v2, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -1061,13 +1091,13 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; GFX10-MAD-LABEL: no_madak_src0_modifier_f32: ; GFX10-MAD: ; %bb.0: ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: s_clause 0x1 ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_mad_f32 v1, |v1|, v2, 0x41200000 ; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] @@ -1076,8 +1106,10 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; GFX11-MAD-LABEL: no_madak_src0_modifier_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: s_clause 0x1 @@ -1094,13 +1126,14 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; ; GFX940-FMA-LABEL: no_madak_src0_modifier_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-FMA-NEXT: s_mov_b32 s0, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3] +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; GFX940-FMA-NEXT: s_mov_b32 s0, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fma_f32 v1, |v1|, v2, s0 ; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 @@ -1109,13 +1142,13 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; GFX10-FMA-LABEL: no_madak_src0_modifier_f32: ; GFX10-FMA: ; %bb.0: ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: s_clause 0x1 ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fma_f32 v1, |v1|, v2, 0x41200000 ; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] @@ -1124,8 +1157,10 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; GFX11-FMA-LABEL: 
no_madak_src0_modifier_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1156,19 +1191,19 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: no_madak_src1_modifier_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7] +; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_mov_b32 s0, 0x41200000 -; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mad_f32 v2, v2, |v3|, s0 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -1176,8 +1211,8 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr 
addrspace(1) noalias % ; ; GFX8-LABEL: no_madak_src1_modifier_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -1199,13 +1234,13 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; ; GFX9-LABEL: no_madak_src1_modifier_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s0, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: s_mov_b32 s0, 0x41200000 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_f32 v1, v1, |v2|, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -1214,13 +1249,13 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; GFX10-MAD-LABEL: no_madak_src1_modifier_f32: ; GFX10-MAD: ; %bb.0: ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: s_clause 0x1 ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_mad_f32 v1, v1, |v2|, 0x41200000 ; GFX10-MAD-NEXT: global_store_dword v0, 
v1, s[4:5] @@ -1229,8 +1264,10 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; GFX11-MAD-LABEL: no_madak_src1_modifier_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: s_clause 0x1 @@ -1247,13 +1284,14 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; ; GFX940-FMA-LABEL: no_madak_src1_modifier_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-FMA-NEXT: s_mov_b32 s0, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3] +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; GFX940-FMA-NEXT: s_mov_b32 s0, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fma_f32 v1, v1, |v2|, s0 ; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 @@ -1262,13 +1300,13 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; GFX10-FMA-LABEL: no_madak_src1_modifier_f32: ; GFX10-FMA: ; %bb.0: ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; 
GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: s_clause 0x1 ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fma_f32 v1, v1, |v2|, 0x41200000 ; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] @@ -1277,8 +1315,10 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; GFX11-FMA-LABEL: no_madak_src1_modifier_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1312,36 +1352,36 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], float %sgpr0, float %sgpr1) #0 { ; GFX6-LABEL: madak_constant_bus_violation: ; GFX6: ; %bb.0: ; %bb -; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[2:3], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX6-NEXT: ; %bb.1: ; %bb3 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, 0 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: .LBB9_2: ; %bb4 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 
+; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x12 +; GFX6-NEXT: s_load_dword s0, s[2:3], 0x12 ; GFX6-NEXT: v_mov_b32_e32 v1, 0x42280000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mac_f32_e64 v1, s0, 0.5 ; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: madak_constant_bus_violation: ; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_cmp_lg_u32 s2, 0 +; GFX8-NEXT: s_cmp_lg_u32 s0, 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX8-NEXT: ; %bb.1: ; %bb3 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -1350,7 +1390,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX8-NEXT: .LBB9_2: ; %bb4 ; GFX8-NEXT: flat_load_dword v0, v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x48 +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x48 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x42280000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mac_f32_e64 v1, s0, 0.5 @@ -1361,9 +1401,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX9-LABEL: madak_constant_bus_violation: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s2, 0 +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX9-NEXT: ; %bb.1: ; %bb3 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -1372,7 +1412,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX9-NEXT: .LBB9_2: ; %bb4 ; GFX9-NEXT: global_load_dword v0, 
v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x48 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x48 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x42280000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mac_f32_e64 v1, s0, 0.5 @@ -1383,9 +1423,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX10-MAD-LABEL: madak_constant_bus_violation: ; GFX10-MAD: ; %bb.0: ; %bb -; GFX10-MAD-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-MAD-NEXT: s_cmp_lg_u32 s2, 0 +; GFX10-MAD-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-MAD-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX10-MAD-NEXT: ; %bb.1: ; %bb3 ; GFX10-MAD-NEXT: v_mov_b32_e32 v0, 0 @@ -1394,7 +1434,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX10-MAD-NEXT: .LBB9_2: ; %bb4 ; GFX10-MAD-NEXT: global_load_dword v0, v[0:1], off glc dlc ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) -; GFX10-MAD-NEXT: s_load_dword s0, s[0:1], 0x48 +; GFX10-MAD-NEXT: s_load_dword s0, s[2:3], 0x48 ; GFX10-MAD-NEXT: v_mov_b32_e32 v1, 0.5 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: v_madak_f32 v1, s0, v1, 0x42280000 @@ -1405,9 +1445,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX11-MAD-LABEL: madak_constant_bus_violation: ; GFX11-MAD: ; %bb.0: ; %bb -; GFX11-MAD-NEXT: s_load_b32 s2, s[0:1], 0x24 +; GFX11-MAD-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-MAD-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-MAD-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-MAD-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX11-MAD-NEXT: ; %bb.1: ; %bb3 ; GFX11-MAD-NEXT: v_mov_b32_e32 v0, 0 @@ -1416,7 +1456,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX11-MAD-NEXT: .LBB9_2: ; %bb4 ; GFX11-MAD-NEXT: global_load_b32 v0, v[0:1], off glc dlc ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) -; GFX11-MAD-NEXT: s_load_b32 s0, 
s[0:1], 0x48 +; GFX11-MAD-NEXT: s_load_b32 s0, s[2:3], 0x48 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e64 v1, s0, 0.5 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1430,9 +1470,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX940-FMA-LABEL: madak_constant_bus_violation: ; GFX940-FMA: ; %bb.0: ; %bb -; GFX940-FMA-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: s_cmp_lg_u32 s2, 0 +; GFX940-FMA-NEXT: s_cmp_lg_u32 s0, 0 ; GFX940-FMA-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX940-FMA-NEXT: ; %bb.1: ; %bb3 ; GFX940-FMA-NEXT: v_mov_b32_e32 v0, 0 @@ -1441,7 +1481,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX940-FMA-NEXT: .LBB9_2: ; %bb4 ; GFX940-FMA-NEXT: global_load_dword v0, v[0:1], off sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: s_load_dword s0, s[0:1], 0x48 +; GFX940-FMA-NEXT: s_load_dword s0, s[2:3], 0x48 ; GFX940-FMA-NEXT: v_mov_b32_e32 v1, 0x42280000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: v_fmac_f32_e64 v1, s0, 0.5 @@ -1452,9 +1492,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX10-FMA-LABEL: madak_constant_bus_violation: ; GFX10-FMA: ; %bb.0: ; %bb -; GFX10-FMA-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FMA-NEXT: s_cmp_lg_u32 s2, 0 +; GFX10-FMA-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-FMA-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX10-FMA-NEXT: ; %bb.1: ; %bb3 ; GFX10-FMA-NEXT: v_mov_b32_e32 v0, 0 @@ -1463,7 +1503,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX10-FMA-NEXT: .LBB9_2: ; %bb4 ; GFX10-FMA-NEXT: global_load_dword v0, v[0:1], off glc dlc ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX10-FMA-NEXT: s_load_dword s0, 
s[0:1], 0x48 +; GFX10-FMA-NEXT: s_load_dword s0, s[2:3], 0x48 ; GFX10-FMA-NEXT: v_mov_b32_e32 v1, 0.5 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: v_fmaak_f32 v1, s0, v1, 0x42280000 @@ -1474,9 +1514,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX11-FMA-LABEL: madak_constant_bus_violation: ; GFX11-FMA: ; %bb.0: ; %bb -; GFX11-FMA-NEXT: s_load_b32 s2, s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FMA-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-FMA-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX11-FMA-NEXT: ; %bb.1: ; %bb3 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 @@ -1485,7 +1525,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX11-FMA-NEXT: .LBB9_2: ; %bb4 ; GFX11-FMA-NEXT: global_load_b32 v0, v[0:1], off glc dlc ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: s_load_b32 s0, s[0:1], 0x48 +; GFX11-FMA-NEXT: s_load_b32 s0, s[2:3], 0x48 ; GFX11-FMA-NEXT: v_mov_b32_e32 v1, 0.5 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll index c7a831185b83c..92536c2078514 100644 --- a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll @@ -6,14 +6,14 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX9-LABEL: test: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s7, s[4:5], 0x1c -; GFX9-NEXT: s_load_dword s8, s[4:5], 0x38 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x1c +; GFX9-NEXT: s_load_dword s5, s[6:7], 0x38 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s7, 0xffff 
-; GFX9-NEXT: s_mul_i32 s6, s6, s4 -; GFX9-NEXT: s_add_i32 s8, s8, s6 -; GFX9-NEXT: v_add_u32_e32 v0, s8, v0 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_mul_i32 s10, s10, s4 +; GFX9-NEXT: s_add_i32 s5, s5, s10 +; GFX9-NEXT: v_add_u32_e32 v0, s5, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -34,13 +34,13 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX10-LABEL: test: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s7, s[4:5], 0x1c -; GFX10-NEXT: s_load_dword s8, s[4:5], 0x38 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s4, s[6:7], 0x1c +; GFX10-NEXT: s_load_dword s5, s[6:7], 0x38 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s4, s7, 0xffff -; GFX10-NEXT: s_mul_i32 s6, s6, s4 -; GFX10-NEXT: v_add3_u32 v0, s8, s6, v0 +; GFX10-NEXT: s_and_b32 s4, s4, 0xffff +; GFX10-NEXT: s_mul_i32 s10, s10, s4 +; GFX10-NEXT: v_add3_u32 v0, s5, s10, v0 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v4 @@ -59,14 +59,16 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX11-LABEL: test: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x1c -; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x38 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x1c +; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x38 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s4, s4, 0xffff -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s15, s15, s4 -; GFX11-NEXT: v_add3_u32 v0, s5, s15, v0 +; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_i32 s13, s13, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_add3_u32 v0, s5, s13, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX11-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll index 2b5d32fa7b977..e929da796de6d 100644 --- a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll +++ b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll @@ -10,7 +10,7 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) { ; GFX10-LABEL: long_store_chain: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_mov_b32 s1, s0 @@ -91,7 +91,7 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) { ; ; GFX11-LABEL: long_store_chain: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 s1, s0 @@ -176,7 +176,7 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) { ; ; GFX12-LABEL: long_store_chain: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_mov_b32 s1, s0 @@ -397,7 +397,7 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) { define amdgpu_kernel void @long_load_chain(ptr addrspace(1) %p) { ; GFX10-LABEL: long_load_chain: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3e ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -670,7 +670,7 @@ define amdgpu_kernel void @long_load_chain(ptr addrspace(1) %p) { ; ; GFX11-LABEL: long_load_chain: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -944,7 +944,7 @@ define amdgpu_kernel void @long_load_chain(ptr addrspace(1) %p) { ; ; GFX12-LABEL: long_load_chain: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1f ; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll index 8ef2ca2765e8a..a8139cc6bc4c9 100644 --- a/llvm/test/CodeGen/AMDGPU/max.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @v_test_imax_sge_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_imax_sge_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -28,12 +28,12 @@ define amdgpu_kernel void @v_test_imax_sge_i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_imax_sge_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; 
GFX9-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_i16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] @@ -54,8 +54,8 @@ define amdgpu_kernel void @v_test_imax_sge_i16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_imax_sge_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_imax_sge_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -78,12 +78,12 @@ define amdgpu_kernel void @v_test_imax_sge_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_imax_sge_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_i16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -104,8 +104,8 @@ define amdgpu_kernel void @v_test_imax_sge_v2i16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_imax_sge_v3i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -139,19 +139,19 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_imax_sge_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[2:3] +; GFX9-NEXT: global_load_dword v3, v0, s[0:1] ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_short_d16 v2, v0, s[6:7] offset:4 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v4, v0, s[6:7] ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: global_load_short_d16 v1, v0, s[2:3] offset:4 +; GFX9-NEXT: global_load_short_d16 v1, v0, s[0:1] offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_pk_max_i16 v3, v4, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -175,8 +175,8 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_test_imax_sge_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_imax_sge_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -202,12 +202,12 @@ define amdgpu_kernel void @v_test_imax_sge_v4i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_imax_sge_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: 
v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_i16 v1, v1, v3 ; GFX9-NEXT: v_pk_max_i16 v0, v0, v2 @@ -229,8 +229,8 @@ define amdgpu_kernel void @v_test_imax_sge_v4i16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_test_imax_sgt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_imax_sgt_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -251,12 +251,12 @@ define amdgpu_kernel void @v_test_imax_sgt_i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_imax_sgt_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_i16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] @@ -277,8 +277,8 @@ define amdgpu_kernel void @v_test_imax_sgt_i16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_umax_uge_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_umax_uge_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -299,12 +299,12 @@ define amdgpu_kernel void @v_test_umax_uge_i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_umax_uge_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_u16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] @@ -325,8 +325,8 @@ define amdgpu_kernel void @v_test_umax_uge_i16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_umax_ugt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_umax_ugt_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -347,12 +347,12 @@ define amdgpu_kernel void @v_test_umax_ugt_i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_umax_ugt_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v2, v0, 
s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_u16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] @@ -372,8 +372,8 @@ define amdgpu_kernel void @v_test_umax_ugt_i16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_umax_ugt_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_umax_ugt_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -396,12 +396,12 @@ define amdgpu_kernel void @v_test_umax_ugt_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_umax_ugt_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_u16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll index bef9ff82aa396..4fb90bbc46a8f 100644 --- a/llvm/test/CodeGen/AMDGPU/max.ll +++ b/llvm/test/CodeGen/AMDGPU/max.ll @@ -5,23 +5,23 @@ define amdgpu_kernel void @v_test_imax_sge_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_test_imax_sge_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 
+; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s6, s[6:7], 0x0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_load_dword s2, s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: v_max_i32_e32 v0, s6, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_max_i32_e32 v0, s2, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: v_test_imax_sge_i32: @@ -58,26 +58,26 @@ define amdgpu_kernel void @v_test_imax_sge_i32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_imax_sge_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_test_imax_sge_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: 
v_max_i32_e32 v3, s11, v3 -; SI-NEXT: v_max_i32_e32 v2, s10, v2 -; SI-NEXT: v_max_i32_e32 v1, s9, v1 -; SI-NEXT: v_max_i32_e32 v0, s8, v0 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: v_max_i32_e32 v3, s7, v3 +; SI-NEXT: v_max_i32_e32 v2, s6, v2 +; SI-NEXT: v_max_i32_e32 v1, s5, v1 +; SI-NEXT: v_max_i32_e32 v0, s4, v0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: v_test_imax_sge_v4i32: @@ -116,7 +116,7 @@ define amdgpu_kernel void @v_test_imax_sge_v4i32(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @s_test_imax_sge_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_test_imax_sge_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -146,8 +146,8 @@ define amdgpu_kernel void @s_test_imax_sge_i32(ptr addrspace(1) %out, i32 %a, i3 define amdgpu_kernel void @s_test_imax_sge_imm_i32(ptr addrspace(1) %out, i32 %a) nounwind { ; SI-LABEL: s_test_imax_sge_imm_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -175,24 +175,24 @@ define amdgpu_kernel void @s_test_imax_sge_imm_i32(ptr addrspace(1) %out, i32 %a define amdgpu_kernel void @v_test_imax_sge_i8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_test_imax_sge_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; 
SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_sbyte v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: buffer_load_sbyte v1, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_max_i32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_byte v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: v_test_imax_sge_i8: @@ -240,8 +240,8 @@ define amdgpu_kernel void @v_test_imax_sge_i8(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @s_test_imax_sgt_imm_i32(ptr addrspace(1) %out, i32 %a) nounwind { ; SI-LABEL: s_test_imax_sgt_imm_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -269,7 +269,7 @@ define amdgpu_kernel void @s_test_imax_sgt_imm_i32(ptr addrspace(1) %out, i32 %a define amdgpu_kernel void @s_test_imax_sgt_imm_v2i32(ptr addrspace(1) %out, <2 x i32> %a) nounwind { ; SI-LABEL: s_test_imax_sgt_imm_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -303,23 +303,23 @@ define amdgpu_kernel void @s_test_imax_sgt_imm_v2i32(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @v_test_imax_sgt_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind 
{ ; SI-LABEL: v_test_imax_sgt_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s6, s[6:7], 0x0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_load_dword s2, s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: v_max_i32_e32 v0, s6, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_max_i32_e32 v0, s2, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: v_test_imax_sgt_i32: @@ -355,7 +355,7 @@ define amdgpu_kernel void @v_test_imax_sgt_i32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @s_test_imax_sgt_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_test_imax_sgt_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -385,23 +385,23 @@ define amdgpu_kernel void @s_test_imax_sgt_i32(ptr addrspace(1) %out, i32 %a, i3 define amdgpu_kernel void @v_test_umax_uge_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_test_umax_uge_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: 
s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s6, s[6:7], 0x0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_load_dword s2, s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: v_max_u32_e32 v0, s6, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_max_u32_e32 v0, s2, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: v_test_umax_uge_i32: @@ -437,7 +437,7 @@ define amdgpu_kernel void @v_test_umax_uge_i32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @s_test_umax_uge_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_test_umax_uge_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -467,20 +467,20 @@ define amdgpu_kernel void @s_test_umax_uge_i32(ptr addrspace(1) %out, i32 %a, i3 define amdgpu_kernel void @s_test_umax_uge_v3i32(ptr addrspace(1) %out, <3 x i32> %a, <3 x i32> %b) nounwind { ; SI-LABEL: s_test_umax_uge_v3i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_max_u32 s6, s6, s10 -; SI-NEXT: s_max_u32 s5, s5, 
s9 -; SI-NEXT: s_max_u32 s4, s4, s8 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 +; SI-NEXT: s_max_u32 s2, s6, s10 +; SI-NEXT: s_max_u32 s0, s5, s9 +; SI-NEXT: s_max_u32 s1, s4, s8 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: s_test_umax_uge_v3i32: @@ -507,24 +507,24 @@ define amdgpu_kernel void @s_test_umax_uge_v3i32(ptr addrspace(1) %out, <3 x i32 define amdgpu_kernel void @v_test_umax_uge_i8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_test_umax_uge_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_ubyte v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_max_u32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_byte v0, off, 
s[8:11], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: v_test_umax_uge_i8: @@ -565,20 +565,20 @@ define amdgpu_kernel void @v_test_umax_uge_i8(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @v_test_umax_ugt_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_test_umax_ugt_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dword s4, s[4:5], 0x0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dword s0, s[4:5], 0x0 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: v_max_u32_e32 v0, s4, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_max_u32_e32 v0, s0, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: v_test_umax_ugt_i32: @@ -614,7 +614,7 @@ define amdgpu_kernel void @v_test_umax_ugt_i32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @s_test_umax_ugt_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_test_umax_ugt_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -644,7 +644,7 @@ define amdgpu_kernel void @s_test_umax_ugt_i32(ptr addrspace(1) %out, i32 %a, i3 define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(ptr addrspace(1) %out, <2 x i32> %a) nounwind { ; SI-LABEL: s_test_umax_ugt_imm_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; 
SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -680,9 +680,9 @@ define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(ptr addrspace(1) %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) nounwind { ; SI-LABEL: simplify_demanded_bits_test_umax_ugt_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0x13 -; SI-NEXT: s_load_dword s5, s[0:1], 0x1c -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x13 +; SI-NEXT: s_load_dword s5, s[2:3], 0x1c +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -727,9 +727,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(ptr addrspac define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(ptr addrspace(1) %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) nounwind { ; SI-LABEL: simplify_demanded_bits_test_max_slt_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0x13 -; SI-NEXT: s_load_dword s5, s[0:1], 0x1c -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x13 +; SI-NEXT: s_load_dword s5, s[2:3], 0x1c +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -773,9 +773,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(ptr addrspace define amdgpu_kernel void @s_test_imax_sge_i16(ptr addrspace(1) %out, [8 x i32], i16 %a, [8 x i32], i16 %b) nounwind { ; SI-LABEL: s_test_imax_sge_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0x13 -; SI-NEXT: s_load_dword s5, s[0:1], 0x1c -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x13 +; SI-NEXT: s_load_dword s5, s[2:3], 0x1c +; SI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -826,8 +826,8 @@ define amdgpu_kernel void @s_test_imax_sge_i16(ptr addrspace(1) %out, [8 x i32], define amdgpu_kernel void @test_umax_ugt_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: test_umax_ugt_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -868,8 +868,8 @@ define amdgpu_kernel void @test_umax_ugt_i64(ptr addrspace(1) %out, i64 %a, i64 define amdgpu_kernel void @test_umax_uge_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: test_umax_uge_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -910,8 +910,8 @@ define amdgpu_kernel void @test_umax_uge_i64(ptr addrspace(1) %out, i64 %a, i64 define amdgpu_kernel void @test_imax_sgt_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: test_imax_sgt_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -952,8 +952,8 @@ define amdgpu_kernel void @test_imax_sgt_i64(ptr addrspace(1) %out, i64 %a, i64 define amdgpu_kernel void @test_imax_sge_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: test_imax_sge_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: 
s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll index ae1f31272a15f..0a76e169e9c38 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0 { ; CHECK-LABEL: memcpy_p0_p0_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 @@ -121,7 +121,7 @@ entry: define amdgpu_kernel void @memcpy_p1_p1_minsize(ptr addrspace(1) %dest, ptr addrspace(1) %src) #0 { ; CHECK-LABEL: memcpy_p1_p1_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v12, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx2 v[8:9], v12, s[2:3] offset:32 @@ -145,7 +145,7 @@ entry: define amdgpu_kernel void @memcpy_p1_p4_minsize(ptr addrspace(1) %global, ptr addrspace(4) %0) #0 { ; CHECK-LABEL: memcpy_p1_p4_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v32, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[0:3], v32, s[2:3] @@ -181,12 +181,12 @@ entry: define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #0 { ; CHECK-LABEL: memcpy_p5_p4_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3] -; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1] -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; 
CHECK-NEXT: s_load_dword s2, s[4:5], 0x0 +; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] +; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_add_u32 s8, s8, s7 +; CHECK-NEXT: s_add_u32 s16, s16, s13 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:15 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:14 @@ -206,52 +206,52 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:31 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:30 -; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_addc_u32 s17, s17, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:29 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:15 +; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:14 +; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:14 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:13 +; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:13 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:12 +; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:11 +; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:11 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:10 +; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:10 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:23 ; 
CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:9 +; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:9 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:22 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:8 +; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:8 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:21 ; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:7 +; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:7 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:6 +; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:6 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:19 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:5 +; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:5 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:18 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:2 +; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:2 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:47 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:1 +; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:1 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen +; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:31 +; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:31 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte 
v13, v1, s[8:11], 0 offen offset:4 +; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:30 +; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:4 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:17 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:3 +; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:3 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:16 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:27 @@ -262,229 +262,229 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:44 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:43 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:23 +; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:23 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:36 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:22 +; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:22 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:35 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:21 +; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:21 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:34 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:20 +; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:20 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:33 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:19 +; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:19 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:32 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:28 ; CHECK-NEXT: s_nop 0 -; 
CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:29 +; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:29 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:42 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:18 +; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:18 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:63 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:17 +; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:17 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:16 +; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:16 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:61 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:27 +; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:27 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:40 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:26 +; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:26 ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:39 ; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:25 +; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:25 ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:38 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:24 +; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:24 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:37 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:44 +; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:44 ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:57 ; CHECK-NEXT: s_waitcnt 
vmcnt(28) -; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:43 +; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:43 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:56 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:45 +; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:45 ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:58 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:36 +; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:36 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:49 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:35 +; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:35 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:48 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:46 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:47 +; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:47 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(33) -; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:34 +; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:34 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:79 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:28 +; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:28 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:41 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:42 +; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:42 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:55 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:33 -; CHECK-NEXT: buffer_store_byte v11, 
v1, s[8:11], 0 offen offset:32 +; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:33 +; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:32 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:77 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:61 +; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:61 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:74 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:40 +; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:40 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:53 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:39 +; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:39 ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:52 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:38 +; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:38 ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:51 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:37 +; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:37 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:50 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:57 +; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:57 ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:70 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:56 +; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:56 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:69 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:58 +; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen 
offset:58 ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:71 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:49 +; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:49 ; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:48 +; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:48 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:93 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:46 +; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:46 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:59 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:60 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:73 ; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:41 +; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:41 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:54 ; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:55 +; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:55 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:68 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:74 +; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:74 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:87 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:53 +; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:53 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:66 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:52 +; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 
0 offen offset:52 ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:65 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:51 +; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:51 ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:64 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:62 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:63 +; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:63 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:76 ; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:50 +; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:50 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:95 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:77 +; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:77 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:90 ; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:71 +; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:71 ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:83 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:70 -; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:69 +; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:70 +; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:69 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:59 +; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:59 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:72 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:73 +; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:73 ; CHECK-NEXT: 
global_load_ubyte v15, v0, s[0:1] offset:85 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:54 +; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:54 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:67 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:68 +; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:68 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:81 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:66 +; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:66 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:111 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:65 +; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:65 ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:64 +; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:64 ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:109 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:62 +; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:62 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:75 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:76 +; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:76 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:89 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:90 +; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:90 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:103 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:72 +; 
CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:72 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:86 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:84 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:82 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:87 +; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:87 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:100 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:67 +; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:67 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:80 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:78 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:94 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:79 +; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:79 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:92 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:95 +; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:95 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:108 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:93 +; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:93 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:106 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:75 +; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:75 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:88 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:89 +; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:89 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:102 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; 
CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:78 +; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:78 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:91 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:94 +; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:94 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:107 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:92 +; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:92 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:105 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:88 +; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:88 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:101 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:91 +; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:91 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:104 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:86 -; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:85 -; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:84 -; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:83 -; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:82 +; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:86 +; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:85 +; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:84 +; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:83 +; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:82 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:96 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:97 @@ -492,13 +492,13 @@ define 
amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:99 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:120 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:81 -; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:80 -; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:111 -; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:110 -; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:109 -; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:108 -; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:100 +; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:81 +; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:80 +; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:111 +; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:110 +; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:109 +; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:108 +; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:100 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:121 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:122 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:123 @@ -506,54 +506,54 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:125 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:126 ; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:107 +; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:107 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:127 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:106 +; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:106 ; CHECK-NEXT: s_waitcnt 
vmcnt(30) -; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:105 -; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:103 -; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:102 +; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:105 +; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:103 +; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:102 ; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:101 +; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:101 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:116 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:117 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:119 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:114 ; CHECK-NEXT: s_waitcnt vmcnt(34) -; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:104 +; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:104 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:118 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:115 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:113 ; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:99 -; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:98 -; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:97 -; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:96 +; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:99 +; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:98 +; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:97 +; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:127 -; CHECK-NEXT: buffer_store_byte 
v6, v1, s[8:11], 0 offen offset:126 -; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:125 -; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:124 -; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:123 -; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:122 -; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:121 -; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:120 +; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:127 +; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:126 +; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:125 +; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:123 +; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:122 +; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:121 +; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:120 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:119 +; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:119 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:118 -; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:117 -; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:116 +; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:118 +; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:117 +; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:116 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:115 -; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:114 +; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:115 +; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:114 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; 
CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:113 +; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:113 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:112 +; CHECK-NEXT: buffer_store_byte v21, v1, s[16:19], 0 offen offset:112 ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false) @@ -563,32 +563,32 @@ entry: define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %src) #0 { ; CHECK-LABEL: memcpy_p0_p5_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3] -; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1] -; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 -; CHECK-NEXT: s_add_u32 s8, s8, s7 -; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] +; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] +; CHECK-NEXT: s_load_dword s0, s[6:7], 0x8 +; CHECK-NEXT: s_add_u32 s16, s16, s13 +; CHECK-NEXT: s_addc_u32 s17, s17, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:3 -; CHECK-NEXT: 
buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:2 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:30 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:15 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:14 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:13 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:12 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:11 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:10 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:9 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:8 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:7 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:6 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:5 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:3 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:2 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:1 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:31 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-NEXT: v_mov_b32_e32 v1, s1 @@ -601,287 +601,287 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 ; CHECK-NEXT: flat_store_byte 
v[0:1], v9 offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:23 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:23 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:22 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:22 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:21 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:21 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:20 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:20 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:19 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:19 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:18 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:18 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:17 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:17 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:47 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:47 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:31 ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:16 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:28 -; CHECK-NEXT: 
buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:45 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:16 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:27 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:26 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:25 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:45 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:37 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:37 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:22 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:36 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:36 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:21 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:35 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:35 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:34 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:34 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:19 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:33 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:33 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:18 -; CHECK-NEXT: 
buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:32 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:32 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:29 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:29 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:44 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:44 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:17 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:63 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:63 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:16 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:42 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:42 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:40 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:40 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:25 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:39 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:39 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:24 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:38 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:38 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:41 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:41 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:45 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:59 +; CHECK-NEXT: buffer_load_ubyte v19, v2, 
s[16:19], 0 offen offset:59 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:37 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:51 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:51 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:36 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:50 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:50 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:35 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:49 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:49 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:34 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:48 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:48 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:46 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:46 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:47 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:61 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:61 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:29 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:43 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:43 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:44 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:58 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:58 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:33 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:79 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:79 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:32 ; CHECK-NEXT: 
flat_store_byte v[0:1], v4 offset:42 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:56 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:56 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:40 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:54 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:54 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:39 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:53 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:53 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:38 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:52 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:52 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:55 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:55 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:59 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:73 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:73 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:51 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:65 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:65 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:50 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:64 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:64 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:62 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:62 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:63 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:77 +; CHECK-NEXT: buffer_load_ubyte 
v15, v2, s[16:19], 0 offen offset:77 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:46 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:61 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:75 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:75 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:57 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:57 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:58 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:72 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:72 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:49 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:95 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:95 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:48 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:56 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:70 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:70 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:68 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:68 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:53 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:67 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:67 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:52 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:66 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:66 ; 
CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:55 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:69 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:69 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:73 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:87 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:87 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:65 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:111 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:111 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:64 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:110 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:110 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:62 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:76 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:76 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:77 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:91 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:91 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:60 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:74 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:74 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:75 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:89 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:89 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:71 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:71 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:72 -; 
CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:86 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:86 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:70 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:84 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:84 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:68 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:83 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:83 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:67 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:81 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:81 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:66 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:80 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:80 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:78 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:78 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:79 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:93 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:93 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:69 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:82 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:82 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:87 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:101 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:101 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:76 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:90 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:90 
; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:91 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:105 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:105 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:74 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:88 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:88 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:89 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:103 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:103 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:71 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:85 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:85 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:86 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:100 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:100 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:78 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:92 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:92 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:93 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:107 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:107 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:90 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:104 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:104 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:88 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:102 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:102 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 
offset:85 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:99 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:99 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:94 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:94 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:95 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:109 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:109 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:92 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:106 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:106 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:94 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:108 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:108 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:84 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:81 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:96 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:96 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:97 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:98 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:120 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:97 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:98 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:120 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:80 ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:111 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:110 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:109 ; 
CHECK-NEXT: flat_store_byte v[0:1], v3 offset:99 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:121 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:121 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:122 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:123 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:124 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:122 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:123 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:124 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:107 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:105 @@ -891,20 +891,20 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:102 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:101 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:100 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:126 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:116 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:117 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:118 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:119 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:127 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:114 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:115 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:126 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:116 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:117 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:118 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:119 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen 
offset:127 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:114 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:115 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:108 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:125 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:125 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:113 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:112 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:113 +; CHECK-NEXT: buffer_load_ubyte v21, v2, s[16:19], 0 offen offset:112 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:98 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:97 @@ -935,7 +935,7 @@ entry: define amdgpu_kernel void @memcpy_p3_p4_minsize(ptr addrspace(4) %0) #0 { ; CHECK-LABEL: memcpy_p3_p4_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v24, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] @@ -971,7 +971,7 @@ entry: define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 { ; CHECK-LABEL: memcpy_p0_p3_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: ds_read_u8 v3, v2 offset:112 ; CHECK-NEXT: ds_read_u8 v4, v2 offset:113 @@ -1254,7 +1254,7 @@ entry: define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 { ; CHECK-LABEL: memcpy_p0_p0_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 @@ -1367,7 +1367,7 @@ entry: define amdgpu_kernel void 
@memcpy_p1_p1_optsize(ptr addrspace(1) %dest, ptr addrspace(1) %src) #1 { ; CHECK-LABEL: memcpy_p1_p1_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v12, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx2 v[8:9], v12, s[2:3] offset:32 @@ -1391,7 +1391,7 @@ entry: define amdgpu_kernel void @memcpy_p1_p4_optsize(ptr addrspace(1) %global, ptr addrspace(4) %0) #1 { ; CHECK-LABEL: memcpy_p1_p4_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v32, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[0:3], v32, s[2:3] @@ -1427,12 +1427,12 @@ entry: define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #1 { ; CHECK-LABEL: memcpy_p5_p4_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3] -; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1] -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0 +; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] +; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_add_u32 s8, s8, s7 +; CHECK-NEXT: s_add_u32 s16, s16, s13 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:15 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:14 @@ -1452,52 +1452,52 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:31 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:30 -; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_addc_u32 s17, s17, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: 
global_load_ubyte v20, v0, s[0:1] offset:29 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:15 +; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:14 +; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:14 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:13 +; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:13 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:12 +; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:11 +; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:11 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:10 +; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:10 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:9 +; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:9 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:22 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:8 +; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:8 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:21 ; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:7 +; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:7 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:6 +; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:6 ; CHECK-NEXT: 
global_load_ubyte v11, v0, s[0:1] offset:19 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:5 +; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:5 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:18 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:2 +; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:2 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:47 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:1 +; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:1 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen +; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:31 +; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:31 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:4 +; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:30 +; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:4 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:17 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:3 +; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:3 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:16 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:27 @@ -1508,229 +1508,229 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:44 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:43 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:23 +; 
CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:23 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:36 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:22 +; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:22 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:35 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:21 +; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:21 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:34 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:20 +; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:20 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:33 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:19 +; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:19 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:32 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:28 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:29 +; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:29 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:42 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:18 +; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:18 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:63 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:17 +; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:17 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:16 +; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:16 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:61 ; CHECK-NEXT: s_waitcnt 
vmcnt(24) -; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:27 +; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:27 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:40 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:26 +; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:26 ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:39 ; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:25 +; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:25 ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:38 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:24 +; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:24 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:37 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:44 +; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:44 ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:57 ; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:43 +; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:43 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:56 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:45 +; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:45 ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:58 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:36 +; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:36 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:49 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:35 +; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:35 ; CHECK-NEXT: global_load_ubyte 
v8, v0, s[0:1] offset:48 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:46 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:47 +; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:47 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(33) -; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:34 +; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:34 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:79 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:28 +; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:28 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:41 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:42 +; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:42 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:55 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:33 -; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:32 +; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:33 +; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:32 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:77 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:61 +; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:61 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:74 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:40 +; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:40 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:53 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:39 +; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:39 
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:52 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:38 +; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:38 ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:51 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:37 +; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:37 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:50 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:57 +; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:57 ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:70 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:56 +; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:56 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:69 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:58 +; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:58 ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:71 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:49 +; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:49 ; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:48 +; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:48 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:93 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:46 +; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:46 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:59 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen 
offset:60 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:73 ; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:41 +; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:41 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:54 ; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:55 +; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:55 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:68 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:74 +; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:74 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:87 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:53 +; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:53 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:66 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:52 +; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:52 ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:65 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:51 +; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:51 ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:64 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:62 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:63 +; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:63 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:76 ; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:50 +; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:50 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:95 ; CHECK-NEXT: s_nop 0 -; 
CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:77 +; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:77 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:90 ; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:71 +; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:71 ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:83 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:70 -; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:69 +; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:70 +; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:69 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:59 +; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:59 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:72 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:73 +; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:73 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:85 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:54 +; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:54 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:67 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:68 +; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:68 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:81 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:66 +; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:66 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:111 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:65 +; CHECK-NEXT: 
buffer_store_byte v4, v1, s[16:19], 0 offen offset:65 ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:64 +; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:64 ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:109 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:62 +; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:62 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:75 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:76 +; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:76 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:89 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:90 +; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:90 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:103 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:72 +; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:72 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:86 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:84 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:82 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:87 +; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:87 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:100 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:67 +; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:67 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:80 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:78 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:94 ; 
CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:79 +; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:79 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:92 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:95 +; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:95 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:108 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:93 +; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:93 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:106 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:75 +; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:75 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:88 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:89 +; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:89 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:102 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:78 +; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:78 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:91 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:94 +; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:94 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:107 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:92 +; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:92 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:105 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:88 +; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:88 ; CHECK-NEXT: 
global_load_ubyte v13, v0, s[0:1] offset:101 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:91 +; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:91 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:104 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:86 -; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:85 -; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:84 -; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:83 -; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:82 +; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:86 +; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:85 +; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:84 +; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:83 +; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:82 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:96 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:97 @@ -1738,13 +1738,13 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:99 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:120 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:81 -; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:80 -; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:111 -; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:110 -; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:109 -; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:108 -; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:100 +; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:81 +; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:80 +; 
CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:111 +; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:110 +; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:109 +; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:108 +; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:100 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:121 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:122 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:123 @@ -1752,54 +1752,54 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:125 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:126 ; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:107 +; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:107 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:127 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:106 +; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:106 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:105 -; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:103 -; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:102 +; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:105 +; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:103 +; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:102 ; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:101 +; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:101 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:116 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:117 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:119 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:114 
; CHECK-NEXT: s_waitcnt vmcnt(34) -; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:104 +; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:104 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:118 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:115 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:113 ; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:99 -; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:98 -; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:97 -; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:96 +; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:99 +; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:98 +; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:97 +; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:127 -; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:126 -; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:125 -; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:124 -; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:123 -; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:122 -; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:121 -; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:120 +; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:127 +; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:126 +; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:125 +; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:123 +; CHECK-NEXT: buffer_store_byte v2, 
v1, s[16:19], 0 offen offset:122 +; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:121 +; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:120 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:119 +; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:119 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:118 -; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:117 -; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:116 +; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:118 +; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:117 +; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:116 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:115 -; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:114 +; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:115 +; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:114 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:113 +; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:113 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:112 +; CHECK-NEXT: buffer_store_byte v21, v1, s[16:19], 0 offen offset:112 ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false) @@ -1809,32 +1809,32 @@ entry: define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %src) #1 { ; CHECK-LABEL: memcpy_p0_p5_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3] -; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1] -; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 -; CHECK-NEXT: s_add_u32 s8, s8, s7 -; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: 
s_mov_b64 s[18:19], s[2:3] +; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] +; CHECK-NEXT: s_load_dword s0, s[6:7], 0x8 +; CHECK-NEXT: s_add_u32 s16, s16, s13 +; CHECK-NEXT: s_addc_u32 s17, s17, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:2 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:30 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:15 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:14 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:13 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:12 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:11 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:10 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen 
offset:9 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:8 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:7 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:6 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:5 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:3 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:2 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:1 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:31 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-NEXT: v_mov_b32_e32 v1, s1 @@ -1847,287 +1847,287 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:23 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:23 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:22 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:22 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:21 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:21 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:20 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:20 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte 
v[0:1], v13 offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:19 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:19 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:18 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:18 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:17 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:17 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:47 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:47 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:31 ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:16 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:45 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:16 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:27 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:26 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:25 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:45 ; CHECK-NEXT: s_waitcnt 
vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:37 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:37 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:22 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:36 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:36 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:21 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:35 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:35 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:34 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:34 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:19 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:33 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:33 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:18 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:32 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:32 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:29 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:29 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:44 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:44 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:17 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:63 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:63 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:16 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28 -; CHECK-NEXT: 
buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:42 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:42 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:40 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:40 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:25 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:39 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:39 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:24 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:38 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:38 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:41 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:41 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:45 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:59 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:59 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:37 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:51 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:51 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:36 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:50 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:50 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:35 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:49 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:49 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:34 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:48 +; CHECK-NEXT: buffer_load_ubyte 
v12, v2, s[16:19], 0 offen offset:48 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:46 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:46 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:47 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:61 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:61 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:29 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:43 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:43 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:44 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:58 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:58 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:33 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:79 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:79 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:32 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:56 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:56 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:40 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:54 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:54 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:39 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:53 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:53 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:38 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:52 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:52 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], 
v5 offset:41 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:55 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:55 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:59 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:73 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:73 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:51 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:65 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:65 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:50 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:64 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:64 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:62 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:62 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:63 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:77 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:77 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:46 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:61 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:75 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:75 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:57 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:57 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:58 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:72 +; CHECK-NEXT: buffer_load_ubyte v20, v2, 
s[16:19], 0 offen offset:72 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:49 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:95 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:95 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:48 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:56 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:70 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:70 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:68 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:68 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:53 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:67 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:67 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:52 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:66 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:66 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:55 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:69 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:69 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:73 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:87 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:87 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:65 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:111 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:111 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:64 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:110 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:110 ; 
CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:62 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:76 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:76 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:77 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:91 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:91 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:60 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:74 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:74 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:75 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:89 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:89 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:71 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:71 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:72 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:86 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:86 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:70 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:84 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:84 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:68 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:83 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:83 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:67 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:81 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:81 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:66 -; CHECK-NEXT: 
buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:80 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:80 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:78 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:78 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:79 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:93 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:93 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:69 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:82 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:82 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:87 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:101 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:101 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:76 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:90 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:90 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:91 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:105 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:105 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:74 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:88 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:88 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:89 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:103 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:103 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:71 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:85 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:85 ; 
CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:86 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:100 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:100 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:78 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:92 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:92 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:93 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:107 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:107 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:90 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:104 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:104 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:88 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:102 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:102 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:85 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:99 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:99 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:94 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:94 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:95 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:109 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:109 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:92 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:106 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:106 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:94 -; CHECK-NEXT: 
buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:108 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:108 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:84 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:81 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:96 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:96 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:97 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:98 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:120 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:97 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:98 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:120 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:80 ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:111 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:110 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:109 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:99 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:121 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:121 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:122 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:123 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:124 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:122 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:123 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:124 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:107 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:105 @@ -2137,20 +2137,20 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr 
%generic, ptr addrspace(5) % ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:102 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:101 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:100 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:126 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:116 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:117 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:118 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:119 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:127 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:114 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:115 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:126 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:116 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:117 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:118 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:119 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:127 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:114 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:115 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:108 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:125 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:125 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:113 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:112 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:113 +; CHECK-NEXT: buffer_load_ubyte v21, v2, s[16:19], 0 offen offset:112 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:98 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:97 @@ -2181,7 +2181,7 @@ entry: 
define amdgpu_kernel void @memcpy_p3_p4_optsize(ptr addrspace(4) %0) #1 { ; CHECK-LABEL: memcpy_p3_p4_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v24, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] @@ -2217,7 +2217,7 @@ entry: define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 { ; CHECK-LABEL: memcpy_p0_p3_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: ds_read_u8 v3, v2 offset:112 ; CHECK-NEXT: ds_read_u8 v4, v2 offset:113 diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-scalar-load.ll b/llvm/test/CodeGen/AMDGPU/memcpy-scalar-load.ll index f60728c16a3ae..3a6d8ca1e35f6 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-scalar-load.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-scalar-load.ll @@ -9,7 +9,7 @@ define void @memcpy_p1_p4_sz16_align_4_4(ptr addrspace(1) align 4 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p4_sz16_align_4_4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v3, s5 @@ -26,7 +26,7 @@ define void @memcpy_p1_p4_sz31_align_4_4(ptr addrspace(1) align 4 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p4_sz31_align_4_4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s8 @@ -34,7 +34,7 @@ define void @memcpy_p1_p4_sz31_align_4_4(ptr addrspace(1) align 4 %dst, ptr addr ; CHECK-NEXT: v_mov_b32_e32 v4, s10 ; 
CHECK-NEXT: v_mov_b32_e32 v5, s11 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_load_dwordx4 v[2:5], v6, s[4:5] offset:15 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v6, s[6:7] offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -47,7 +47,7 @@ define void @memcpy_p1_p4_sz32_align_4_4(ptr addrspace(1) align 4 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p4_sz32_align_4_4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v3, s5 diff --git a/llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll b/llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll index 1b8483a54bb3b..b32bfd0e495ba 100644 --- a/llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll +++ b/llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll @@ -9,7 +9,7 @@ define void @memmove_p1_p4_sz16_align_4_4(ptr addrspace(1) align 4 %dst, ptr add ; CHECK-LABEL: memmove_p1_p4_sz16_align_4_4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v3, s5 @@ -27,8 +27,8 @@ define void @memmove_p1_p4_sz31_align_4_4(ptr addrspace(1) align 4 %dst, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: global_load_ubyte v9, v2, s[4:5] offset:30 -; CHECK-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; CHECK-NEXT: global_load_ubyte v9, v2, s[6:7] offset:30 +; CHECK-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 
v3, s5 @@ -53,7 +53,7 @@ define void @memmove_p1_p4_sz32_align_4_4(ptr addrspace(1) align 4 %dst, ptr add ; CHECK-LABEL: memmove_p1_p4_sz32_align_4_4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s8 ; CHECK-NEXT: v_mov_b32_e32 v3, s9 diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll index c49e0501665c5..3a065d518f0a9 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @vector_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1) { ; GCN-LABEL: vector_clause: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] @@ -24,7 +24,7 @@ define amdgpu_kernel void @vector_clause(ptr addrspace(1) noalias nocapture read ; ; GCN-SCRATCH-LABEL: vector_clause: ; GCN-SCRATCH: ; %bb.0: ; %bb -; GCN-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v16, 4, v0 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-SCRATCH-NEXT: s_clause 0x3 @@ -69,7 +69,7 @@ bb: define amdgpu_kernel void @scalar_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1) { ; GCN-LABEL: scalar_clause: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -98,7 +98,7 @@ define amdgpu_kernel void @scalar_clause(ptr 
addrspace(1) noalias nocapture read ; ; GCN-SCRATCH-LABEL: scalar_clause: ; GCN-SCRATCH: ; %bb.0: ; %bb -; GCN-SCRATCH-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 +; GCN-SCRATCH-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v16, 0 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-SCRATCH-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -250,11 +250,11 @@ bb: define amdgpu_kernel void @vector_clause_indirect(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture readnone %arg1, ptr addrspace(1) noalias nocapture %arg2) { ; GCN-LABEL: vector_clause_indirect: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx2 v[8:9], v0, s[2:3] +; GCN-NEXT: global_load_dwordx2 v[8:9], v0, s[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v[8:9], off ; GCN-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16 @@ -267,20 +267,20 @@ define amdgpu_kernel void @vector_clause_indirect(ptr addrspace(1) noalias nocap ; ; GCN-SCRATCH-LABEL: vector_clause_indirect: ; GCN-SCRATCH: ; %bb.0: ; %bb -; GCN-SCRATCH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GCN-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN-SCRATCH-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v8, 0 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GCN-SCRATCH-NEXT: global_load_dwordx2 v[4:5], v0, s[2:3] +; GCN-SCRATCH-NEXT: global_load_dwordx2 v[4:5], v0, s[0:1] ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GCN-SCRATCH-NEXT: s_clause 0x1 ; GCN-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[4:5], off ; GCN-SCRATCH-NEXT: global_load_dwordx4 v[4:7], 
v[4:5], off offset:16 ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(1) -; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[0:3], s[2:3] ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] offset:16 ; GCN-SCRATCH-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -384,10 +384,10 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc ; GCN-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 ; GCN-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s18, -1 -; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 ; GCN-NEXT: s_mov_b32 s19, 0xe00000 -; GCN-NEXT: s_add_u32 s16, s16, s3 +; GCN-NEXT: s_add_u32 s16, s16, s9 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 ; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x40b00000 ; GCN-NEXT: buffer_store_dword v0, off, s[16:19], 0 @@ -411,13 +411,13 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc ; ; GCN-SCRATCH-LABEL: flat_scratch_load: ; GCN-SCRATCH: ; %bb.0: ; %.entry -; GCN-SCRATCH-NEXT: s_add_u32 s2, s2, s5 -; GCN-SCRATCH-NEXT: s_addc_u32 s3, s3, 0 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GCN-SCRATCH-NEXT: s_add_u32 s6, s6, s11 +; GCN-SCRATCH-NEXT: s_addc_u32 s7, s7, 0 +; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 ; GCN-SCRATCH-NEXT: s_clause 0x1 -; GCN-SCRATCH-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x24 -; GCN-SCRATCH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x44 +; GCN-SCRATCH-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x24 +; GCN-SCRATCH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x44 ; 
GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40b00000 ; GCN-SCRATCH-NEXT: s_brev_b32 s8, 1 ; GCN-SCRATCH-NEXT: s_mov_b32 s9, s8 @@ -453,22 +453,22 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc define amdgpu_kernel void @flat_scratch_load_clause(float %a, float %b, <8 x i32> %desc) { ; GCN-LABEL: flat_scratch_load_clause: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; GCN-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s7, 0xe00000 -; GCN-NEXT: s_add_u32 s4, s4, s3 -; GCN-NEXT: s_addc_u32 s5, s5, 0 +; GCN-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s14, -1 +; GCN-NEXT: s_mov_b32 s15, 0xe00000 +; GCN-NEXT: s_add_u32 s12, s12, s9 +; GCN-NEXT: s_addc_u32 s13, s13, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x40b00000 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 0x40d00000 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GCN-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:4 +; GCN-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; GCN-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-NEXT: exp mrt0 v0, off, off, off done vm @@ -476,10 +476,10 @@ define amdgpu_kernel void @flat_scratch_load_clause(float %a, float %b, <8 x i32 ; ; GCN-SCRATCH-LABEL: flat_scratch_load_clause: ; GCN-SCRATCH: ; %bb.0: ; %.entry -; GCN-SCRATCH-NEXT: s_add_u32 s2, s2, s5 -; GCN-SCRATCH-NEXT: s_addc_u32 s3, s3, 0 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; 
GCN-SCRATCH-NEXT: s_add_u32 s6, s6, s11 +; GCN-SCRATCH-NEXT: s_addc_u32 s7, s7, 0 +; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40b00000 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40d00000 ; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off diff --git a/llvm/test/CodeGen/AMDGPU/mfma-bf16-vgpr-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-bf16-vgpr-cd-select.ll index 7bb09f6697b68..9c2b437a08f08 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-bf16-vgpr-cd-select.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-bf16-vgpr-cd-select.ll @@ -18,7 +18,7 @@ declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32, i32, <4 x i32>, i32, i32 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x2bf16: ; GCN: v_mfma_f32_32x32x2bf16 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %a = bitcast i32 1 to <2 x i16> @@ -30,7 +30,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x2bf16: ; GCN: v_mfma_f32_16x16x2bf16 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> undef, <2 x i16> undef, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -40,7 +40,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_4x4x2bf16: ; GCN: v_mfma_f32_4x4x2bf16 v[{{[0-9:]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <4 x 
float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> undef, <2 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -50,7 +50,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16: ; GCN: v_mfma_f32_32x32x4bf16 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> undef, <2 x i16> undef, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -60,7 +60,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x8bf16: ; GCN: v_mfma_f32_16x16x8bf16 v[{{[0-9:]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> undef, <2 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -70,7 +70,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16_1k: ; GCN: v_mfma_f32_32x32x4bf16_1k v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> undef, <4 x i16> undef, <32 x float> %in.1, i32 0, i32 0, i32 0) @@ -80,7 +80,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x4bf16_1k: ; GCN: v_mfma_f32_16x16x4bf16_1k v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void 
@test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> undef, <4 x i16> undef, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -90,7 +90,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_4x4x4bf16_1k: ; GCN: v_mfma_f32_4x4x4bf16_1k v[{{[0-9:]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> undef, <4 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -100,7 +100,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x8bf16_1k: ; GCN: v_mfma_f32_32x32x8bf16_1k v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> undef, <4 x i16> undef, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -110,7 +110,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x16bf16_1k: ; GCN: v_mfma_f32_16x16x16bf16_1k v[{{[0-9:]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> undef, <4 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ 
-120,7 +120,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f64_4x4x4f64: ; GCN: v_mfma_f64_4x4x4f64 v[{{[0-9:]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+:[0-9]+}} -define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg) #0 { bb: %mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double 1.0, double 1.0, double 128.0, i32 0, i32 0, i32 0) store double %mai.1, ptr addrspace(1) %arg @@ -129,7 +129,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64: ; GCN: v_mfma_f64_16x16x4f64 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <4 x double>, ptr addrspace(1) %arg %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double 1.0, double 1.0, <4 x double> %in.1, i32 0, i32 0, i32 0) @@ -139,7 +139,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_i32_32x32x8i8: ; GCN: v_mfma_i32_32x32x8i8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <16 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 1, i32 1, <16 x i32> %in.1, i32 0, i32 0, i32 0) @@ -149,10 +149,12 @@ bb: ; GCN-LABEL: {{^}}test_mfma_i32_16x16x16i8: ; GCN: v_mfma_i32_16x16x16i8 v[{{[0-9:]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_i32_16x16x16i8(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_i32_16x16x16i8(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <4 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 1, i32 1, <4 x i32> %in.1, i32 0, i32 0, i32 
0) store <4 x i32> %mai.1, ptr addrspace(1) %arg ret void } + +attributes #0 = { "amdgpu-no-agpr" } diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll index ba34c1bbe1d71..e0708a55f438b 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll @@ -19,7 +19,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_agpr: ; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg) #1 { +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg) #2 { bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) @@ -29,7 +29,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr: ; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr addrspace(1) %arg) { bb: %acc = call i32 asm sideeffect "; def $0", "={a0}"() %in.1 = load <32 x float>, ptr addrspace(1) %arg @@ -40,7 +40,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_inline_asm_phys_agpr: ; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr addrspace(1) %arg) { bb: call void asm sideeffect "; use $0", "{a[100:131]}"(<32 x float> undef) %in.1 = load <32 x float>, ptr addrspace(1) %arg @@ -63,7 +63,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_call: ; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, 
a[{{[0-9:]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg) #1 { bb: call void @foo() %in.1 = load <32 x float>, ptr addrspace(1) %arg @@ -78,7 +78,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_call_multi_bb: ; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace(1) %arg, i1 %c0) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace(1) %arg, i1 %c0) #1 { bb1: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3) @@ -106,5 +106,6 @@ bb: declare void @foo() -attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" } -attributes #1 = { "amdgpu-flat-work-group-size"="1,256" } +attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" "amdgpu-no-agpr" } +attributes #1 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" } +attributes #2 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-agpr" } diff --git a/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll b/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll index 59b13c02f92fb..b48152dad99ac 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll @@ -30,7 +30,7 @@ declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32>, <4 x i3 ; GCN-LABEL: {{^}}test_mfma_i32_16x16x32i8: ; GCN: v_mfma_i32_16x16x32_i8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <4 x 
i32>, ptr addrspace(1) %arg %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 4294967298, i64 12884901892, <4 x i32> %in.1, i32 0, i32 0, i32 0) @@ -40,7 +40,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_i32_32x32x16i8: ; GCN: v_mfma_i32_32x32x16_i8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <16 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 4294967298, i64 12884901892, <16 x i32> %in.1, i32 0, i32 0, i32 0) @@ -50,7 +50,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x8xf32: ; GCN: v_mfma_f32_16x16x8_xf32 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> , <2 x float> , <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -60,7 +60,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x4xf32: ; GCN: v_mfma_f32_32x32x4_xf32 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> , <2 x float> , <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -70,7 +70,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_bf8_bf8: ; GCN: v_mfma_f32_16x16x32_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void 
@test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -80,7 +80,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_bf8_fp8: ; GCN: v_mfma_f32_16x16x32_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -90,7 +90,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_fp8_bf8: ; GCN: v_mfma_f32_16x16x32_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -100,7 +100,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_fp8_fp8: ; GCN: v_mfma_f32_16x16x32_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, 
i32 0, i32 0, i32 0) @@ -110,7 +110,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_bf8_bf8: ; GCN: v_mfma_f32_32x32x16_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -120,7 +120,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_bf8_fp8: ; GCN: v_mfma_f32_32x32x16_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.fp8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -130,7 +130,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_fp8_bf8: ; GCN: v_mfma_f32_32x32x16_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.bf8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -140,7 +140,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_fp8_fp8: ; GCN: v_mfma_f32_32x32x16_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg) { 
+define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -150,7 +150,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_f32_16x16x32_f16: ; GCN: v_smfmac_f32_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) { +define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) #0 { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half> %a, <8 x half> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -160,7 +160,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_f32_32x32x16_f16: ; GCN: v_smfmac_f32_32x32x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) { +define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) #0 { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half> %a, <8 x half> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -170,7 +170,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_f32_16x16x32_bf16: ; GCN: v_smfmac_f32_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) { +define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) #0 { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call 
<4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16> %a, <8 x i16> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -180,7 +180,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_f32_32x32x16_bf16: ; GCN: v_smfmac_f32_32x32x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) { +define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) #0 { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16> %a, <8 x i16> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -190,7 +190,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_i8: ; GCN: v_smfmac_i32_16x16x64_i8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { +define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { bb: %in.1 = load <4 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32> %a, <4 x i32> %b, <4 x i32> %in.1, i32 %idx, i32 0, i32 0) @@ -200,7 +200,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_i8: ; GCN: v_smfmac_i32_32x32x32_i8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { +define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { bb: %in.1 = load <16 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32> %a, <4 x i32> %b, <16 x i32> %in.1, i32 %idx, i32 0, i32 0) @@ -210,7 +210,7 @@ bb: ; GCN-LABEL: 
{{^}}test_smfmac_i32_16x16x64_bf8_bf8: ; GCN: v_smfmac_f32_16x16x64_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { +define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -220,7 +220,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_bf8_fp8: ; GCN: v_smfmac_f32_16x16x64_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { +define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -230,7 +230,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_fp8_bf8: ; GCN: v_smfmac_f32_16x16x64_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { +define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -240,7 +240,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_fp8_fp8: ; GCN: v_smfmac_f32_16x16x64_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], 
v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { +define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -250,7 +250,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_bf8_bf8: ; GCN: v_smfmac_f32_32x32x32_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { +define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -260,7 +260,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_bf8_fp8: ; GCN: v_smfmac_f32_32x32x32_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { +define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -270,7 +270,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_fp8_bf8: ; GCN: v_smfmac_f32_32x32x32_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %arg, <2 x 
i32> %a, <4 x i32> %b, i32 %idx) { +define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -280,10 +280,12 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_fp8_fp8: ; GCN: v_smfmac_f32_32x32x32_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { +define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0) store <16 x float> %mai.1, ptr addrspace(1) %arg ret void } + +attributes #0 = { "amdgpu-no-agpr" } diff --git a/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select.ll index 06775f5d3f92b..bffd15872c42c 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select.ll @@ -19,7 +19,7 @@ declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32: ; GCN: v_mfma_f32_32x32x1{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 1.0, <32 x float> %in.1, i32 0, i32 0, i32 0) @@ -29,7 +29,7 @@ bb: ; GCN-LABEL: 
{{^}}test_mfma_f32_16x16x1f32: ; GCN: v_mfma_f32_16x16x1{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 1.0, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -39,7 +39,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32: ; GCN: v_mfma_f32_4x4x1{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -49,7 +49,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x2f32: ; GCN: v_mfma_f32_32x32x2{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float 1.0, float 1.0, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -59,7 +59,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f32: ; GCN: v_mfma_f32_16x16x4{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float 1.0, float 1.0, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -69,7 +69,7 @@ bb: 
; GCN-LABEL: {{^}}test_mfma_f32_32x32x4f16: ; GCN: v_mfma_f32_32x32x4{{.*}} v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> undef, <4 x half> undef, <32 x float> %in.1, i32 0, i32 0, i32 0) @@ -79,7 +79,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f16: ; GCN: v_mfma_f32_16x16x4{{.*}} v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> undef, <4 x half> undef, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -89,7 +89,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_4x4x4f16: ; GCN: v_mfma_f32_4x4x4{{.*}} v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> undef, <4 x half> undef, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -99,7 +99,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x8f16: ; GCN: v_mfma_f32_32x32x8{{.*}} v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call 
<16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> undef, <4 x half> undef, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -109,7 +109,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x16f16: ; GCN: v_mfma_f32_16x16x16{{.*}} v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> undef, <4 x half> undef, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -119,7 +119,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_i32_32x32x4i8: ; GCN: v_mfma_i32_32x32x4{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <32 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 1, i32 1, <32 x i32> %in.1, i32 0, i32 0, i32 0) @@ -129,7 +129,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_i32_16x16x4i8: ; GCN: v_mfma_i32_16x16x4{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <16 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 1, i32 1, <16 x i32> %in.1, i32 0, i32 0, i32 0) @@ -139,10 +139,12 @@ bb: ; GCN-LABEL: {{^}}test_mfma_i32_4x4x4i8: ; GCN: v_mfma_i32_4x4x4{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 { bb: %in.1 = load <4 x i32>, ptr 
addrspace(1) %arg %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 1, <4 x i32> %in.1, i32 0, i32 0, i32 0) store <4 x i32> %mai.1, ptr addrspace(1) %arg ret void } + +attributes #0 = { "amdgpu-no-agpr" } diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index 9dafa27ece86f..a77892c8f5fc7 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -31,8 +31,8 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_imin_sle_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -53,8 +53,8 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_imin_sle_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -75,12 +75,12 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_imin_sle_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -89,13 
+89,13 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_imin_sle_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -104,8 +104,10 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_imin_sle_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -143,7 +145,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; CI-LABEL: s_test_imin_sle_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -154,7 +156,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; VI-LABEL: s_test_imin_sle_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -165,7 
+167,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX9-LABEL: s_test_imin_sle_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, s3 @@ -175,7 +177,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX10-LABEL: s_test_imin_sle_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, s3 @@ -185,7 +187,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX11-LABEL: s_test_imin_sle_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_i32 s2, s2, s3 @@ -215,7 +217,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; CI-LABEL: s_test_imin_sle_v1i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -226,7 +228,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; VI-LABEL: s_test_imin_sle_v1i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -237,7 +239,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX9-LABEL: s_test_imin_sle_v1i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 
0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, s3 @@ -247,7 +249,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX10-LABEL: s_test_imin_sle_v1i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, s3 @@ -257,7 +259,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX11-LABEL: s_test_imin_sle_v1i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_i32 s2, s2, s3 @@ -290,8 +292,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; ; CI-LABEL: s_test_imin_sle_v4i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s11, s15 ; CI-NEXT: s_min_i32 s3, s10, s14 @@ -308,8 +310,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; ; VI-LABEL: s_test_imin_sle_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s11, s15 ; VI-NEXT: s_min_i32 s3, s10, s14 @@ -326,8 +328,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; ; GFX9-LABEL: s_test_imin_sle_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 
s[8:15], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s11, s15 @@ -344,8 +346,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; GFX10-LABEL: s_test_imin_sle_v4i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s11, s15 @@ -362,8 +364,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; GFX11-LABEL: s_test_imin_sle_v4i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_i32 s2, s7, s11 @@ -417,9 +419,9 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; ; CI-LABEL: s_test_imin_sle_i8: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0xa -; CI-NEXT: s_load_dword s3, s[4:5], 0x13 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0xa +; CI-NEXT: s_load_dword s3, s[6:7], 0x13 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i8 s2, s2 ; CI-NEXT: s_sext_i32_i8 s3, s3 @@ -432,9 +434,9 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; ; VI-LABEL: s_test_imin_sle_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x28 -; VI-NEXT: s_load_dword s3, s[4:5], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x28 +; VI-NEXT: s_load_dword 
s3, s[6:7], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i8 s2, s2 ; VI-NEXT: s_sext_i32_i8 s3, s3 @@ -447,9 +449,9 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; ; GFX9-LABEL: s_test_imin_sle_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x28 -; GFX9-NEXT: s_load_dword s3, s[4:5], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28 +; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i8 s2, s2 @@ -462,9 +464,9 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; GFX10-LABEL: s_test_imin_sle_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x28 -; GFX10-NEXT: s_load_dword s3, s[4:5], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28 +; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i8 s2, s2 @@ -477,13 +479,13 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; GFX11-LABEL: s_test_imin_sle_i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x28 -; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x28 +; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i8 s2, s2 -; GFX11-NEXT: s_sext_i32_i8 s3, s3 +; GFX11-NEXT: s_sext_i32_i8 s2, s4 +; GFX11-NEXT: s_sext_i32_i8 s3, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 
; GFX11-NEXT: s_min_i32 s2, s2, s3 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 @@ -554,9 +556,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; ; CI-LABEL: s_test_imin_sle_v4i8: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0xa -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: s_load_dword s3, s[4:5], 0x13 +; CI-NEXT: s_load_dword s2, s[6:7], 0xa +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s3, s[6:7], 0x13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s4, s2, 24 ; CI-NEXT: s_sext_i32_i8 s5, s2 @@ -587,9 +589,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; ; VI-LABEL: s_test_imin_sle_v4i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x28 -; VI-NEXT: s_load_dword s3, s[4:5], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x28 +; VI-NEXT: s_load_dword s3, s[6:7], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshrrev_b16_e64 v0, 8, s2 ; VI-NEXT: v_lshrrev_b16_e64 v1, 8, s3 @@ -616,9 +618,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; ; GFX9-LABEL: s_test_imin_sle_v4i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x28 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s3, s[4:5], 0x4c +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s4, s2, 16 @@ -644,9 +646,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; GFX10-LABEL: s_test_imin_sle_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x28 -; GFX10-NEXT: s_load_dword s3, s[4:5], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 
0x28 +; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshr_b32 s4, s2, 16 ; GFX10-NEXT: s_lshr_b32 s5, s3, 16 @@ -673,29 +675,27 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; ; GFX11-LABEL: s_test_imin_sle_v4i8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x28 -; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x28 +; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x4c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s4, s2, 16 -; GFX11-NEXT: s_lshr_b32 s5, s3, 16 -; GFX11-NEXT: v_ashrrev_i16 v0, 8, s2 -; GFX11-NEXT: v_ashrrev_i16 v1, 8, s3 +; GFX11-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-NEXT: v_ashrrev_i16 v0, 8, s0 +; GFX11-NEXT: v_ashrrev_i16 v1, 8, s1 ; GFX11-NEXT: v_ashrrev_i16 v2, 8, s4 ; GFX11-NEXT: v_ashrrev_i16 v3, 8, s5 -; GFX11-NEXT: s_bfe_i32 s2, s2, 0x80000 -; GFX11-NEXT: s_bfe_i32 s3, s3, 0x80000 +; GFX11-NEXT: s_bfe_i32 s0, s0, 0x80000 +; GFX11-NEXT: s_bfe_i32 s1, s1, 0x80000 ; GFX11-NEXT: s_bfe_i32 s4, s4, 0x80000 ; GFX11-NEXT: s_bfe_i32 s5, s5, 0x80000 -; GFX11-NEXT: v_min_i16 v4, s2, s3 +; GFX11-NEXT: v_min_i16 v4, s0, s1 ; GFX11-NEXT: v_min_i16 v5, s4, s5 ; GFX11-NEXT: v_min_i16 v2, v2, v3 ; GFX11-NEXT: v_min_i16 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v4 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b16 v2, 8, v2 ; GFX11-NEXT: v_lshlrev_b16 v0, 8, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -707,6 +707,7 @@ define amdgpu_kernel void 
@s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -751,7 +752,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; CI-LABEL: s_test_imin_sle_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s4, s2, 16 ; CI-NEXT: s_sext_i32_i16 s2, s2 @@ -770,7 +771,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; VI-LABEL: s_test_imin_sle_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s4, s2, 16 ; VI-NEXT: s_sext_i32_i16 s2, s2 @@ -789,7 +790,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; GFX9-LABEL: s_test_imin_sle_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -799,7 +800,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; GFX10-LABEL: s_test_imin_sle_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_min_i16 v1, s2, s3 @@ -808,7 +809,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; GFX11-LABEL: s_test_imin_sle_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; 
GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_min_i16 v1, s2, s3 @@ -903,8 +904,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; ; CI-LABEL: s_test_imin_sle_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s6, s0, 16 ; CI-NEXT: s_ashr_i32 s7, s1, 16 @@ -933,8 +934,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; ; VI-LABEL: s_test_imin_sle_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s6, s1, 16 ; VI-NEXT: s_sext_i32_i16 s1, s1 @@ -963,34 +964,34 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; ; GFX9-LABEL: s_test_imin_sle_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_pk_min_i16 v1, s1, v0 ; GFX9-NEXT: v_pk_min_i16 v0, s0, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_imin_sle_v4i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_min_i16 v1, s1, s3 ; GFX10-NEXT: v_pk_min_i16 v0, s0, s2 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_imin_sle_v4i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_min_i16 v1, s5, s7 @@ -1030,8 +1031,8 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_imin_slt_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1052,8 +1053,8 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_imin_slt_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1074,12 +1075,12 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_imin_slt_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; 
GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -1088,13 +1089,13 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_imin_slt_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -1103,8 +1104,10 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_imin_slt_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1169,8 +1172,8 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_imin_slt_i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1191,8 +1194,8 @@ 
define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_imin_slt_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1213,12 +1216,12 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_imin_slt_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v2, v0, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] @@ -1227,13 +1230,13 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_imin_slt_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] +; GFX10-NEXT: global_load_ushort v2, v0, s[4:5] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_i16 v1, v1, v2 ; GFX10-NEXT: global_store_short v0, v1, s[0:1] @@ -1242,8 +1245,10 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: 
v_test_imin_slt_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1282,7 +1287,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; CI-LABEL: s_test_imin_slt_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1293,7 +1298,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; VI-LABEL: s_test_imin_slt_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1304,7 +1309,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX9-LABEL: s_test_imin_slt_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, s3 @@ -1314,7 +1319,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX10-LABEL: s_test_imin_slt_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, s3 @@ -1324,7 +1329,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX11-LABEL: s_test_imin_slt_i32: ; 
GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_i32 s2, s2, s3 @@ -1355,8 +1360,8 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; ; CI-LABEL: s_test_imin_slt_v2i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s1, s1, s3 ; CI-NEXT: s_min_i32 s0, s0, s2 @@ -1369,8 +1374,8 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; ; VI-LABEL: s_test_imin_slt_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s1, s1, s3 ; VI-NEXT: s_min_i32 s0, s0, s2 @@ -1383,36 +1388,36 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; ; GFX9-LABEL: s_test_imin_slt_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s1, s1, s3 ; GFX9-NEXT: s_min_i32 s0, s0, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_imin_slt_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 
s[0:3], s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s0, s0, s2 ; GFX10-NEXT: s_min_i32 s1, s1, s3 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_imin_slt_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_i32 s2, s4, s6 @@ -1443,8 +1448,8 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; ; CI-LABEL: s_test_imin_slt_imm_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, 8 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1455,8 +1460,8 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; ; VI-LABEL: s_test_imin_slt_imm_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1467,8 +1472,8 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; ; GFX9-LABEL: s_test_imin_slt_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; 
GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, 8 @@ -1479,8 +1484,8 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; GFX10-LABEL: s_test_imin_slt_imm_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, 8 @@ -1491,11 +1496,11 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; GFX11-LABEL: s_test_imin_slt_imm_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_min_i32 s2, s2, 8 +; GFX11-NEXT: s_min_i32 s2, s4, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1522,8 +1527,8 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; ; CI-LABEL: s_test_imin_sle_imm_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, 8 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1534,8 +1539,8 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; ; VI-LABEL: s_test_imin_sle_imm_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1546,8 +1551,8 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; ; GFX9-LABEL: s_test_imin_sle_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, 8 @@ -1558,8 +1563,8 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; GFX10-LABEL: s_test_imin_sle_imm_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, 8 @@ -1570,11 +1575,11 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; GFX11-LABEL: s_test_imin_sle_imm_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_min_i32 s2, s2, 8 +; GFX11-NEXT: s_min_i32 s2, s4, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1612,8 +1617,8 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_umin_ule_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: 
v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1634,8 +1639,8 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_umin_ule_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1656,12 +1661,12 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_umin_ule_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -1670,13 +1675,13 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_umin_ule_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -1685,8 
+1690,10 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_umin_ule_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1741,8 +1748,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; ; CI-LABEL: v_test_umin_ule_v3i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1765,8 +1772,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_test_umin_ule_v3i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1789,12 +1796,12 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_umin_ule_v3i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx3 v[0:2], v6, s[2:3] -; GFX9-NEXT: global_load_dwordx3 v[3:5], v6, s[6:7] +; 
GFX9-NEXT: global_load_dwordx3 v[3:5], v6, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u32_e32 v2, v2, v5 ; GFX9-NEXT: v_min_u32_e32 v1, v1, v4 @@ -1805,13 +1812,13 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_test_umin_ule_v3i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx3 v[0:2], v6, s[2:3] -; GFX10-NEXT: global_load_dwordx3 v[3:5], v6, s[6:7] +; GFX10-NEXT: global_load_dwordx3 v[3:5], v6, s[4:5] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_u32_e32 v2, v2, v5 ; GFX10-NEXT: v_min_u32_e32 v1, v1, v4 @@ -1822,8 +1829,10 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_test_umin_ule_v3i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1902,8 +1911,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; ; CI-LABEL: v_test_umin_ule_v3i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1938,8 +1947,8 @@ define amdgpu_kernel void 
@v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_test_umin_ule_v3i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1966,12 +1975,12 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_umin_ule_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_min_u16 v1, v1, v3 ; GFX9-NEXT: v_pk_min_u16 v0, v0, v2 @@ -1982,13 +1991,13 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_test_umin_ule_v3i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_min_u16 v1, v1, v3 ; GFX10-NEXT: v_pk_min_u16 v0, v0, v2 @@ -1999,8 +2008,10 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: 
v_test_umin_ule_v3i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2042,7 +2053,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; CI-LABEL: s_test_umin_ule_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2053,7 +2064,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; VI-LABEL: s_test_umin_ule_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2064,7 +2075,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX9-LABEL: s_test_umin_ule_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_u32 s2, s2, s3 @@ -2074,7 +2085,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX10-LABEL: s_test_umin_ule_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_u32 s2, s2, s3 @@ -2084,7 +2095,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX11-LABEL: s_test_umin_ule_i32: ; 
GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_u32 s2, s2, s3 @@ -2125,8 +2136,8 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_umin_ult_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2147,8 +2158,8 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_umin_ult_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2169,12 +2180,12 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_umin_ult_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -2183,13 +2194,13 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_umin_ult_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -2198,8 +2209,10 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_umin_ult_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2255,8 +2268,8 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; ; CI-LABEL: v_test_umin_ult_i8: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s3 ; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 @@ -2276,8 +2289,8 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: v_test_umin_ult_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: v_add_u32_e32 
v1, vcc, s2, v0 @@ -2297,11 +2310,11 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: v_test_umin_ult_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] +; GFX9-NEXT: global_load_ubyte v2, v0, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_byte v0, v1, s[0:1] @@ -2310,12 +2323,12 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; GFX10-LABEL: v_test_umin_ult_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] +; GFX10-NEXT: global_load_ubyte v2, v0, s[4:5] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_u16 v1, v1, v2 ; GFX10-NEXT: global_store_byte v0, v1, s[0:1] @@ -2324,8 +2337,9 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; GFX11-LABEL: v_test_umin_ult_i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_u8 v1, v0, s[6:7] @@ -2363,7 +2377,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; 
CI-LABEL: s_test_umin_ult_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2374,7 +2388,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; VI-LABEL: s_test_umin_ult_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2385,7 +2399,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX9-LABEL: s_test_umin_ult_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_u32 s2, s2, s3 @@ -2395,7 +2409,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX10-LABEL: s_test_umin_ult_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_u32 s2, s2, s3 @@ -2405,7 +2419,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX11-LABEL: s_test_umin_ult_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_u32 s2, s2, s3 @@ -2457,7 +2471,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; CI-LABEL: v_test_umin_ult_i32_multi_use: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s4, s[4:5], 0x0 ; CI-NEXT: 
s_load_dword s5, s[6:7], 0x0 @@ -2478,7 +2492,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; VI-LABEL: v_test_umin_ult_i32_multi_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[4:5], 0x0 ; VI-NEXT: s_load_dword s5, s[6:7], 0x0 @@ -2499,7 +2513,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; GFX9-LABEL: v_test_umin_ult_i32_multi_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s8, s[4:5], 0x0 @@ -2517,7 +2531,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; GFX10-LABEL: v_test_umin_ult_i32_multi_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s8, s[4:5], 0x0 @@ -2535,7 +2549,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; GFX11-LABEL: v_test_umin_ult_i32_multi_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 @@ -2607,7 +2621,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; CI-LABEL: v_test_umin_ult_i16_multi_use: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 @@ -2629,7 +2643,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; VI-LABEL: 
v_test_umin_ult_i16_multi_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -2651,7 +2665,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; GFX9-LABEL: v_test_umin_ult_i16_multi_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] @@ -2666,7 +2680,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; GFX10-LABEL: v_test_umin_ult_i16_multi_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -2682,7 +2696,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; GFX11-LABEL: v_test_umin_ult_i16_multi_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2721,7 +2735,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; CI-LABEL: s_test_umin_ult_v1i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2732,7 +2746,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; VI-LABEL: s_test_umin_ult_v1i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; 
VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2743,7 +2757,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX9-LABEL: s_test_umin_ult_v1i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_u32 s2, s2, s3 @@ -2753,7 +2767,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX10-LABEL: s_test_umin_ult_v1i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_u32 s2, s2, s3 @@ -2763,7 +2777,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX11-LABEL: s_test_umin_ult_v1i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_u32 s2, s2, s3 @@ -2804,8 +2818,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; CI-LABEL: s_test_umin_ult_v8i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x8 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x8 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s4, s11, s19 ; CI-NEXT: s_min_u32 s5, s10, s18 @@ -2835,8 +2849,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; VI-LABEL: s_test_umin_ult_v8i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s4, s11, s19 ; VI-NEXT: s_min_u32 s5, 
s10, s18 @@ -2866,8 +2880,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; GFX9-LABEL: s_test_umin_ult_v8i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_u32 s4, s9, s17 @@ -2894,8 +2908,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; GFX10-LABEL: s_test_umin_ult_v8i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x20 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x20 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_u32 s4, s9, s17 @@ -2921,8 +2935,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; GFX11-LABEL: s_test_umin_ult_v8i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x20 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x20 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_u32 s2, s7, s15 @@ -3095,8 +3109,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; ; CI-LABEL: s_test_umin_ult_v8i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s2, s8, 16 ; CI-NEXT: s_and_b32 s3, s8, 0xffff @@ -3141,8 +3155,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 
; ; VI-LABEL: s_test_umin_ult_v8i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s2, s11, 16 ; VI-NEXT: s_lshr_b32 s4, s10, 16 @@ -3187,8 +3201,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; ; GFX9-LABEL: s_test_umin_ult_v8i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s15 @@ -3205,8 +3219,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; GFX10-LABEL: s_test_umin_ult_v8i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_min_u16 v3, s11, s15 @@ -3219,8 +3233,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; GFX11-LABEL: s_test_umin_ult_v8i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_min_u16 v3, s7, s11 @@ -3263,9 +3277,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; ; CI-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 
0xa -; CI-NEXT: s_load_dword s3, s[4:5], 0x13 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0xa +; CI-NEXT: s_load_dword s3, s[6:7], 0x13 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0xffff ; CI-NEXT: s_and_b32 s3, s3, 0xffff @@ -3278,9 +3292,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; ; VI-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x28 -; VI-NEXT: s_load_dword s3, s[4:5], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x28 +; VI-NEXT: s_load_dword s3, s[6:7], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_and_b32 s3, s3, 0xffff @@ -3293,9 +3307,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; ; GFX9-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x28 -; GFX9-NEXT: s_load_dword s3, s[4:5], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28 +; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff @@ -3308,9 +3322,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; GFX10-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x28 -; GFX10-NEXT: s_load_dword s3, s[4:5], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28 +; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: 
s_and_b32 s2, s2, 0xffff @@ -3323,13 +3337,13 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; GFX11-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x28 -; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x28 +; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-NEXT: s_and_b32 s2, s4, 0xffff +; GFX11-NEXT: s_and_b32 s3, s5, 0xffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_min_u32 s2, s2, s3 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 @@ -3372,9 +3386,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; ; CI-LABEL: simplify_demanded_bits_test_min_slt_i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0xa -; CI-NEXT: s_load_dword s3, s[4:5], 0x13 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0xa +; CI-NEXT: s_load_dword s3, s[6:7], 0x13 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i16 s2, s2 ; CI-NEXT: s_sext_i32_i16 s3, s3 @@ -3387,9 +3401,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; ; VI-LABEL: simplify_demanded_bits_test_min_slt_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x28 -; VI-NEXT: s_load_dword s3, s[4:5], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x28 +; VI-NEXT: s_load_dword s3, s[6:7], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i16 s2, s2 ; VI-NEXT: s_sext_i32_i16 s3, s3 @@ -3402,9 +3416,9 @@ define amdgpu_kernel void 
@simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; ; GFX9-LABEL: simplify_demanded_bits_test_min_slt_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x28 -; GFX9-NEXT: s_load_dword s3, s[4:5], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28 +; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s2, s2 @@ -3417,9 +3431,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; GFX10-LABEL: simplify_demanded_bits_test_min_slt_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x28 -; GFX10-NEXT: s_load_dword s3, s[4:5], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28 +; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s2, s2 @@ -3432,13 +3446,13 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; GFX11-LABEL: simplify_demanded_bits_test_min_slt_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x28 -; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x28 +; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s2, s2 -; GFX11-NEXT: s_sext_i32_i16 s3, s3 +; GFX11-NEXT: s_sext_i32_i16 s2, s4 +; GFX11-NEXT: s_sext_i32_i16 s3, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_min_i32 s2, s2, s3 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 @@ -3489,8 +3503,8 @@ define amdgpu_kernel void 
@s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; ; CI-LABEL: s_test_imin_sle_i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i16 s3, s2 ; CI-NEXT: s_ashr_i32 s2, s2, 16 @@ -3503,8 +3517,8 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; ; VI-LABEL: s_test_imin_sle_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i16 s3, s2 ; VI-NEXT: s_ashr_i32 s2, s2, 16 @@ -3517,8 +3531,8 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; ; GFX9-LABEL: s_test_imin_sle_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s3, s2 @@ -3531,8 +3545,8 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; GFX10-LABEL: s_test_imin_sle_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s3, s2 @@ -3545,14 +3559,14 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; GFX11-LABEL: s_test_imin_sle_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 
+; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s3, s2 -; GFX11-NEXT: s_ashr_i32 s2, s2, 16 +; GFX11-NEXT: s_sext_i32_i16 s2, s4 +; GFX11-NEXT: s_ashr_i32 s3, s4, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_min_i32 s2, s3, s2 +; GFX11-NEXT: s_min_i32 s2, s2, s3 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -3585,8 +3599,8 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; CI-LABEL: test_umin_ult_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3603,8 +3617,8 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; VI-LABEL: test_umin_ult_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3621,16 +3635,16 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX9-LABEL: test_umin_ult_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_cselect_b32 s3, s3, s7 -; GFX9-NEXT: s_cselect_b32 s2, s2, s6 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s3, s3, s5 +; GFX9-NEXT: s_cselect_b32 s2, s2, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3639,14 +3653,14 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX10-LABEL: test_umin_ult_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[2:3], s[6:7] -; GFX10-NEXT: s_and_b32 s4, s4, exec_lo -; GFX10-NEXT: s_cselect_b32 s2, s2, s6 -; GFX10-NEXT: s_cselect_b32 s3, s3, s7 +; GFX10-NEXT: v_cmp_lt_u64_e64 s6, s[2:3], s[4:5] +; GFX10-NEXT: s_and_b32 s6, s6, exec_lo +; GFX10-NEXT: s_cselect_b32 s2, s2, s4 +; GFX10-NEXT: s_cselect_b32 s3, s3, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3655,8 +3669,8 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-LABEL: test_umin_ult_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_u64_e64 s2, s[6:7], s[0:1] @@ -3695,8 +3709,8 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; CI-LABEL: test_umin_ule_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3713,8 +3727,8 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; VI-LABEL: test_umin_ule_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3731,16 +3745,16 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX9-LABEL: test_umin_ule_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_cselect_b32 s3, s3, s7 -; GFX9-NEXT: s_cselect_b32 s2, s2, s6 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s3, s3, s5 +; GFX9-NEXT: s_cselect_b32 s2, s2, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3749,14 +3763,14 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX10-LABEL: test_umin_ule_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_le_u64_e64 s4, s[2:3], s[6:7] -; GFX10-NEXT: s_and_b32 s4, s4, exec_lo -; GFX10-NEXT: s_cselect_b32 s2, s2, s6 -; GFX10-NEXT: s_cselect_b32 s3, s3, s7 +; GFX10-NEXT: v_cmp_le_u64_e64 s6, s[2:3], s[4:5] +; GFX10-NEXT: s_and_b32 s6, s6, exec_lo +; GFX10-NEXT: s_cselect_b32 s2, s2, s4 +; GFX10-NEXT: s_cselect_b32 s3, s3, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3765,8 +3779,8 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-LABEL: test_umin_ule_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_u64_e64 s2, s[6:7], s[0:1] @@ -3805,8 +3819,8 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; CI-LABEL: test_imin_slt_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3823,8 +3837,8 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; VI-LABEL: test_imin_slt_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3841,16 +3855,16 @@ define amdgpu_kernel void 
@test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX9-LABEL: test_imin_slt_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_cselect_b32 s3, s3, s7 -; GFX9-NEXT: s_cselect_b32 s2, s2, s6 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s3, s3, s5 +; GFX9-NEXT: s_cselect_b32 s2, s2, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3859,14 +3873,14 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX10-LABEL: test_imin_slt_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[2:3], s[6:7] -; GFX10-NEXT: s_and_b32 s4, s4, exec_lo -; GFX10-NEXT: s_cselect_b32 s2, s2, s6 -; GFX10-NEXT: s_cselect_b32 s3, s3, s7 +; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[2:3], s[4:5] +; GFX10-NEXT: s_and_b32 s6, s6, exec_lo +; GFX10-NEXT: s_cselect_b32 s2, s2, s4 +; GFX10-NEXT: s_cselect_b32 s3, s3, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3875,8 +3889,8 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-LABEL: test_imin_slt_i64: ; GFX11: ; %bb.0: ; 
GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], s[0:1] @@ -3915,8 +3929,8 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; CI-LABEL: test_imin_sle_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3933,8 +3947,8 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; VI-LABEL: test_imin_sle_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3951,16 +3965,16 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX9-LABEL: test_imin_sle_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_cselect_b32 s3, s3, s7 -; GFX9-NEXT: s_cselect_b32 s2, s2, s6 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: 
s_cselect_b32 s3, s3, s5 +; GFX9-NEXT: s_cselect_b32 s2, s2, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3969,14 +3983,14 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX10-LABEL: test_imin_sle_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_le_i64_e64 s4, s[2:3], s[6:7] -; GFX10-NEXT: s_and_b32 s4, s4, exec_lo -; GFX10-NEXT: s_cselect_b32 s2, s2, s6 -; GFX10-NEXT: s_cselect_b32 s3, s3, s7 +; GFX10-NEXT: v_cmp_le_i64_e64 s6, s[2:3], s[4:5] +; GFX10-NEXT: s_and_b32 s6, s6, exec_lo +; GFX10-NEXT: s_cselect_b32 s2, s2, s4 +; GFX10-NEXT: s_cselect_b32 s3, s3, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3985,8 +3999,8 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-LABEL: test_imin_sle_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_i64_e64 s2, s[6:7], s[0:1] @@ -4048,8 +4062,8 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; ; CI-LABEL: v_test_imin_sle_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt 
lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -4079,8 +4093,8 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_test_imin_sle_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -4103,12 +4117,12 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_imin_sle_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_min_i16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -4117,13 +4131,13 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_test_imin_sle_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_min_i16 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -4132,8 +4146,10 @@ define amdgpu_kernel void 
@v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_test_imin_sle_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -4198,8 +4214,8 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; ; CI-LABEL: v_test_imin_ule_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -4228,8 +4244,8 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_test_imin_ule_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -4252,12 +4268,12 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_imin_ule_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] ; 
GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_min_u16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -4266,13 +4282,13 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_test_imin_ule_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_min_u16 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -4281,8 +4297,10 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_test_imin_ule_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll index 46036256780ba..27b71dd471a83 100644 --- a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll +++ b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll @@ -28,96 +28,128 @@ store i32 0, ptr addrspace(3) @used_by_kernel define amdgpu_kernel void @withcall() { ; GFX9-LABEL: withcall: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 
-; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s3 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NEXT: s_add_u32 s8, s0, 36 -; GFX9-NEXT: s_addc_u32 s9, s1, 0 -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[12:13] -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_mov_b64 s[2:3], s[14:15] +; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s22, -1 +; GFX9-NEXT: s_mov_b32 s23, 0xe00000 +; GFX9-NEXT: s_add_u32 s20, s20, s9 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_add_u32 s8, s2, 36 +; GFX9-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-NEXT: s_getpc_b64 s[2:3] +; GFX9-NEXT: s_add_u32 s2, s2, nonkernel@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s3, s3, nonkernel@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[20:21] +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: ds_write_b32 v0, v0 offset:8 +; GFX9-NEXT: ds_write_b32 v3, v3 offset:8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: withcall: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-NEXT: s_mov_b32 s14, -1 -; GFX10-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-NEXT: s_add_u32 s12, s12, s3 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; 
GFX10-NEXT: s_add_u32 s8, s0, 36 -; GFX10-NEXT: s_addc_u32 s9, s1, 0 -; GFX10-NEXT: s_getpc_b64 s[0:1] -; GFX10-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-NEXT: s_mov_b64 s[0:1], s[12:13] -; GFX10-NEXT: s_mov_b64 s[2:3], s[14:15] +; GFX10-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; GFX10-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX10-NEXT: s_mov_b32 s22, -1 +; GFX10-NEXT: s_mov_b32 s23, 0x31c16000 +; GFX10-NEXT: s_add_u32 s20, s20, s9 +; GFX10-NEXT: s_addc_u32 s21, s21, 0 +; GFX10-NEXT: s_mov_b32 s14, s8 +; GFX10-NEXT: s_add_u32 s8, s2, 36 +; GFX10-NEXT: s_addc_u32 s9, s3, 0 +; GFX10-NEXT: s_getpc_b64 s[2:3] +; GFX10-NEXT: s_add_u32 s2, s2, nonkernel@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s3, s3, nonkernel@gotpcrel32@hi+12 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX10-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX10-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX10-NEXT: s_mov_b64 s[0:1], s[20:21] +; GFX10-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX10-NEXT: s_mov_b32 s12, s6 +; GFX10-NEXT: s_mov_b32 s13, s7 +; GFX10-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX10-NEXT: s_mov_b32 s32, 0 -; GFX10-NEXT: ds_write_b32 v0, v0 offset:8 +; GFX10-NEXT: ds_write_b32 v3, v3 offset:8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX10-NEXT: s_endpgm ; ; G_GFX9-LABEL: withcall: ; G_GFX9: ; %bb.0: -; G_GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; G_GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; G_GFX9-NEXT: s_mov_b32 s14, -1 -; G_GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; G_GFX9-NEXT: s_add_u32 s12, s12, s3 -; G_GFX9-NEXT: s_addc_u32 s13, s13, 0 -; G_GFX9-NEXT: s_add_u32 s8, s0, 36 -; G_GFX9-NEXT: s_addc_u32 s9, s1, 0 +; 
G_GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; G_GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; G_GFX9-NEXT: s_mov_b32 s22, -1 +; G_GFX9-NEXT: s_mov_b32 s23, 0xe00000 +; G_GFX9-NEXT: s_add_u32 s20, s20, s9 +; G_GFX9-NEXT: s_addc_u32 s21, s21, 0 +; G_GFX9-NEXT: s_mov_b32 s14, s8 +; G_GFX9-NEXT: s_add_u32 s8, s2, 36 +; G_GFX9-NEXT: s_addc_u32 s9, s3, 0 +; G_GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; G_GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; G_GFX9-NEXT: s_getpc_b64 s[0:1] ; G_GFX9-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4 ; G_GFX9-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12 -; G_GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; G_GFX9-NEXT: s_mov_b64 s[0:1], s[12:13] -; G_GFX9-NEXT: v_mov_b32_e32 v0, 0 -; G_GFX9-NEXT: v_mov_b32_e32 v1, 8 -; G_GFX9-NEXT: s_mov_b64 s[2:3], s[14:15] +; G_GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; G_GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; G_GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; G_GFX9-NEXT: s_mov_b64 s[0:1], s[20:21] +; G_GFX9-NEXT: v_mov_b32_e32 v3, 0 +; G_GFX9-NEXT: v_mov_b32_e32 v4, 8 +; G_GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; G_GFX9-NEXT: s_mov_b64 s[2:3], s[22:23] +; G_GFX9-NEXT: s_mov_b32 s12, s6 +; G_GFX9-NEXT: s_mov_b32 s13, s7 ; G_GFX9-NEXT: s_mov_b32 s32, 0 -; G_GFX9-NEXT: ds_write_b32 v1, v0 +; G_GFX9-NEXT: ds_write_b32 v4, v3 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; G_GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; G_GFX9-NEXT: s_endpgm ; ; G_GFX10-LABEL: withcall: ; G_GFX10: ; %bb.0: -; G_GFX10-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; G_GFX10-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; G_GFX10-NEXT: s_mov_b32 s14, -1 -; G_GFX10-NEXT: s_mov_b32 s15, 0x31c16000 -; G_GFX10-NEXT: s_add_u32 s12, s12, s3 -; G_GFX10-NEXT: s_addc_u32 s13, s13, 0 -; G_GFX10-NEXT: s_add_u32 s8, s0, 36 -; G_GFX10-NEXT: s_addc_u32 s9, s1, 0 +; G_GFX10-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; G_GFX10-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; G_GFX10-NEXT: s_mov_b32 s22, 
-1 +; G_GFX10-NEXT: s_mov_b32 s23, 0x31c16000 +; G_GFX10-NEXT: s_add_u32 s20, s20, s9 +; G_GFX10-NEXT: s_addc_u32 s21, s21, 0 +; G_GFX10-NEXT: s_mov_b32 s14, s8 +; G_GFX10-NEXT: s_add_u32 s8, s2, 36 +; G_GFX10-NEXT: s_addc_u32 s9, s3, 0 +; G_GFX10-NEXT: s_mov_b64 s[10:11], s[4:5] +; G_GFX10-NEXT: s_mov_b64 s[4:5], s[0:1] ; G_GFX10-NEXT: s_getpc_b64 s[0:1] ; G_GFX10-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4 ; G_GFX10-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12 -; G_GFX10-NEXT: v_mov_b32_e32 v0, 0 -; G_GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, 8 -; G_GFX10-NEXT: s_mov_b64 s[0:1], s[12:13] -; G_GFX10-NEXT: s_mov_b64 s[2:3], s[14:15] +; G_GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; G_GFX10-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; G_GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; G_GFX10-NEXT: v_mov_b32_e32 v3, 0 +; G_GFX10-NEXT: v_mov_b32_e32 v4, 8 +; G_GFX10-NEXT: s_mov_b64 s[0:1], s[20:21] +; G_GFX10-NEXT: s_mov_b64 s[2:3], s[22:23] +; G_GFX10-NEXT: v_or3_b32 v31, v0, v1, v2 +; G_GFX10-NEXT: s_mov_b32 s12, s6 +; G_GFX10-NEXT: s_mov_b32 s13, s7 ; G_GFX10-NEXT: s_mov_b32 s32, 0 -; G_GFX10-NEXT: ds_write_b32 v1, v0 +; G_GFX10-NEXT: ds_write_b32 v4, v3 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; G_GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] ; G_GFX10-NEXT: s_endpgm store i32 0, ptr addrspace(3) @used_by_both call void @nonkernel() diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll index 99120ab4a1424..1c38f8ffc89ed 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll @@ -4,9 +4,9 @@ define amdgpu_kernel void @add_reg_imm(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: add_reg_imm ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: liveins: $sgpr2_sgpr3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: 
[[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1) @@ -27,9 +27,9 @@ define amdgpu_kernel void @add_reg_imm(ptr addrspace(1) %ptr) { define amdgpu_kernel void @add_reg_reg(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: add_reg_reg ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: liveins: $sgpr2_sgpr3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1) @@ -50,9 +50,9 @@ define amdgpu_kernel void @add_reg_reg(ptr addrspace(1) %ptr) { define amdgpu_kernel void @sub_reg_imm(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: sub_reg_imm ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: liveins: $sgpr2_sgpr3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 ; 
CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1) @@ -73,9 +73,9 @@ define amdgpu_kernel void @sub_reg_imm(ptr addrspace(1) %ptr) { define amdgpu_kernel void @sub_imm_reg(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: sub_imm_reg ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: liveins: $sgpr2_sgpr3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1) @@ -96,9 +96,9 @@ define amdgpu_kernel void @sub_imm_reg(ptr addrspace(1) %ptr) { define amdgpu_kernel void @sub_reg_reg(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: sub_reg_reg ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: liveins: $sgpr2_sgpr3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable 
invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll index 4332d9daeaaf5..eb638da390405 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll @@ -13,7 +13,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %x, i32 %y) #0 { ; GCN-LABEL: atomic_max_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -23,13 +23,13 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1 ; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[8:11], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GCN-NEXT: s_cbranch_execz .LBB0_4 ; GCN-NEXT: ; %bb.1: ; %atomic ; GCN-NEXT: s_mov_b32 s8, s10 ; GCN-NEXT: s_mov_b32 s9, s10 ; GCN-NEXT: buffer_load_dword v4, v[1:2], s[8:11], 0 addr64 offset:400 -; GCN-NEXT: s_load_dword s2, s[0:1], 0xf +; GCN-NEXT: s_load_dword s2, s[2:3], 0xf ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: .LBB0_2: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -73,7 +73,7 @@ exit: define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %x, i32 %y) #0 { ; 
GCN-LABEL: atomic_max_i32_noret: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -82,13 +82,13 @@ define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrs ; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[4:7], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GCN-NEXT: s_cbranch_execz .LBB1_3 ; GCN-NEXT: ; %bb.1: ; %atomic ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 ; GCN-NEXT: buffer_load_dword v4, v[1:2], s[4:7], 0 addr64 offset:400 -; GCN-NEXT: s_load_dword s2, s[0:1], 0xf +; GCN-NEXT: s_load_dword s2, s[2:3], 0xf ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: .LBB1_2: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll index 63688ebeab9d0..90a3d350e7416 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll @@ -13,7 +13,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %x, i32 %y) #0 { ; GCN-LABEL: atomic_max_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -23,10 +23,10 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1 ; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[8:11], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN-NEXT: s_and_saveexec_b64 s[0:1], 
vcc ; GCN-NEXT: s_cbranch_execz .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %atomic -; GCN-NEXT: s_load_dword s0, s[0:1], 0xf +; GCN-NEXT: s_load_dword s0, s[2:3], 0xf ; GCN-NEXT: s_mov_b32 s8, s10 ; GCN-NEXT: s_mov_b32 s9, s10 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -58,7 +58,7 @@ exit: define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %x, i32 %y) #0 { ; GCN-LABEL: atomic_max_i32_noret: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -67,10 +67,10 @@ define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrs ; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[4:7], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GCN-NEXT: s_cbranch_execz .LBB1_2 ; GCN-NEXT: ; %bb.1: ; %atomic -; GCN-NEXT: s_load_dword s0, s[0:1], 0xf +; GCN-NEXT: s_load_dword s0, s[2:3], 0xf ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll index 9d6e0927b0dfd..ece7e28c763fb 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll @@ -8,7 +8,7 @@ declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: ctlz_i64_poison: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 @@ -45,7 +45,7 @@ define amdgpu_kernel 
void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; ; GFX10-LABEL: ctlz_i64_poison: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 @@ -87,7 +87,7 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: ctlz_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 @@ -125,7 +125,7 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; ; GFX10-LABEL: ctlz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 @@ -168,7 +168,7 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: cttz_i64_poison: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 @@ -205,7 +205,7 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; ; GFX10-LABEL: cttz_i64_poison: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 @@ 
-249,7 +249,7 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: cttz_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 @@ -287,7 +287,7 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; ; GFX10-LABEL: cttz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll index 1cd9afef13b5e..4630b0d7ef50b 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll @@ -4,9 +4,9 @@ define amdgpu_kernel void @exp_f32(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: exp_f32 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: liveins: $sgpr2_sgpr3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 
1) @@ -24,9 +24,9 @@ define amdgpu_kernel void @exp_f32(ptr addrspace(1) %ptr) { define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: exp_f16 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: liveins: $sgpr2_sgpr3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) @@ -45,9 +45,9 @@ define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) { define amdgpu_kernel void @log_f32(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: log_f32 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: liveins: $sgpr2_sgpr3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1) @@ -65,9 +65,9 @@ define amdgpu_kernel void @log_f32(ptr addrspace(1) %ptr) { define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) { ; 
CHECK-LABEL: name: log_f16 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: liveins: $sgpr2_sgpr3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) @@ -86,9 +86,9 @@ define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) { define amdgpu_kernel void @rcp_f32(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: rcp_f32 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: liveins: $sgpr2_sgpr3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1) @@ -106,9 +106,9 @@ define amdgpu_kernel void @rcp_f32(ptr addrspace(1) %ptr) { define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: rcp_f16 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: liveins: $sgpr2_sgpr3 ; CHECK-NEXT: {{ 
$}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) @@ -127,9 +127,9 @@ define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) { define amdgpu_kernel void @rsq_f32(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: rsq_f32 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: liveins: $sgpr2_sgpr3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1) @@ -147,9 +147,9 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) %ptr) { define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: rsq_f16 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: liveins: $sgpr2_sgpr3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 ; CHECK-NEXT: 
[[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) @@ -168,9 +168,9 @@ define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) { define amdgpu_kernel void @sqrt_f32(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: sqrt_f32 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: liveins: $sgpr2_sgpr3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1) @@ -188,9 +188,9 @@ define amdgpu_kernel void @sqrt_f32(ptr addrspace(1) %ptr) { define amdgpu_kernel void @sqrt_f16(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: sqrt_f16 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: liveins: $sgpr2_sgpr3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from 
%ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll index 4ba5f3abcb24b..4aed9dc2fca6c 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll @@ -5,49 +5,49 @@ ; Test addressing modes when the scratch base is not a frame index. ; GCN-LABEL: {{^}}store_private_offset_i8: -; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], 0 offset:8 +; GCN: buffer_store_byte v{{[0-9]+}}, off, s[12:15], 0 offset:8 define amdgpu_kernel void @store_private_offset_i8() #0 { store volatile i8 5, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: {{^}}store_private_offset_i16: -; GCN: buffer_store_short v{{[0-9]+}}, off, s[4:7], 0 offset:8 +; GCN: buffer_store_short v{{[0-9]+}}, off, s[12:15], 0 offset:8 define amdgpu_kernel void @store_private_offset_i16() #0 { store volatile i16 5, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: {{^}}store_private_offset_i32: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], 0 offset:8 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[12:15], 0 offset:8 define amdgpu_kernel void @store_private_offset_i32() #0 { store volatile i32 5, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: {{^}}store_private_offset_v2i32: -; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8 +; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[12:15], 0 offset:8 define amdgpu_kernel void @store_private_offset_v2i32() #0 { store volatile <2 x i32> , ptr addrspace(5) inttoptr (i32 8 to ptr 
addrspace(5)) ret void } ; GCN-LABEL: {{^}}store_private_offset_v4i32: -; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8 +; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[12:15], 0 offset:8 define amdgpu_kernel void @store_private_offset_v4i32() #0 { store volatile <4 x i32> , ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: {{^}}load_private_offset_i8: -; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], 0 offset:8 +; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[12:15], 0 offset:8 define amdgpu_kernel void @load_private_offset_i8() #0 { %load = load volatile i8, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: {{^}}sextload_private_offset_i8: -; GCN: buffer_load_sbyte v{{[0-9]+}}, off, s[4:7], 0 offset:8 +; GCN: buffer_load_sbyte v{{[0-9]+}}, off, s[12:15], 0 offset:8 define amdgpu_kernel void @sextload_private_offset_i8(ptr addrspace(1) %out) #0 { %load = load volatile i8, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) %sextload = sext i8 %load to i32 @@ -56,7 +56,7 @@ define amdgpu_kernel void @sextload_private_offset_i8(ptr addrspace(1) %out) #0 } ; GCN-LABEL: {{^}}zextload_private_offset_i8: -; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], 0 offset:8 +; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[12:15], 0 offset:8 define amdgpu_kernel void @zextload_private_offset_i8(ptr addrspace(1) %out) #0 { %load = load volatile i8, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) %zextload = zext i8 %load to i32 @@ -65,14 +65,14 @@ define amdgpu_kernel void @zextload_private_offset_i8(ptr addrspace(1) %out) #0 } ; GCN-LABEL: {{^}}load_private_offset_i16: -; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], 0 offset:8 +; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[12:15], 0 offset:8 define amdgpu_kernel void @load_private_offset_i16() #0 { %load = load volatile i16, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: 
{{^}}sextload_private_offset_i16: -; GCN: buffer_load_sshort v{{[0-9]+}}, off, s[4:7], 0 offset:8 +; GCN: buffer_load_sshort v{{[0-9]+}}, off, s[12:15], 0 offset:8 define amdgpu_kernel void @sextload_private_offset_i16(ptr addrspace(1) %out) #0 { %load = load volatile i16, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) %sextload = sext i16 %load to i32 @@ -81,7 +81,7 @@ define amdgpu_kernel void @sextload_private_offset_i16(ptr addrspace(1) %out) #0 } ; GCN-LABEL: {{^}}zextload_private_offset_i16: -; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], 0 offset:8 +; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[12:15], 0 offset:8 define amdgpu_kernel void @zextload_private_offset_i16(ptr addrspace(1) %out) #0 { %load = load volatile i16, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) %zextload = zext i16 %load to i32 @@ -90,28 +90,28 @@ define amdgpu_kernel void @zextload_private_offset_i16(ptr addrspace(1) %out) #0 } ; GCN-LABEL: {{^}}load_private_offset_i32: -; GCN: buffer_load_dword v{{[0-9]+}}, off, s[4:7], 0 offset:8 +; GCN: buffer_load_dword v{{[0-9]+}}, off, s[12:15], 0 offset:8 define amdgpu_kernel void @load_private_offset_i32() #0 { %load = load volatile i32, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: {{^}}load_private_offset_v2i32: -; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8 +; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[12:15], 0 offset:8 define amdgpu_kernel void @load_private_offset_v2i32() #0 { %load = load volatile <2 x i32>, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: {{^}}load_private_offset_v4i32: -; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8 +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[12:15], 0 offset:8 define amdgpu_kernel void @load_private_offset_v4i32() #0 { %load = load volatile <4 x i32>, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: 
{{^}}store_private_offset_i8_max_offset: -; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], 0 offset:4095 +; GCN: buffer_store_byte v{{[0-9]+}}, off, s[12:15], 0 offset:4095 define amdgpu_kernel void @store_private_offset_i8_max_offset() #0 { store volatile i8 5, ptr addrspace(5) inttoptr (i32 4095 to ptr addrspace(5)) ret void @@ -119,7 +119,7 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset() #0 { ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus1: ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000 -; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], 0 offen{{$}} +; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[12:15], 0 offen{{$}} define amdgpu_kernel void @store_private_offset_i8_max_offset_plus1() #0 { store volatile i8 5, ptr addrspace(5) inttoptr (i32 4096 to ptr addrspace(5)) ret void @@ -127,7 +127,7 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset_plus1() #0 { ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus2: ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000 -; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], 0 offen offset:1{{$}} +; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[12:15], 0 offen offset:1{{$}} define amdgpu_kernel void @store_private_offset_i8_max_offset_plus2() #0 { store volatile i8 5, ptr addrspace(5) inttoptr (i32 4097 to ptr addrspace(5)) ret void diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index b4272049f36a4..0889f8ef6316e 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -12,7 +12,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: test_mul_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -31,7 +31,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr 
addrspace(1 ; ; VI-LABEL: test_mul_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -50,7 +50,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_mul_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -69,7 +69,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX10-LABEL: test_mul_v2i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -88,7 +88,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: test_mul_v2i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -109,7 +109,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: test_mul_v2i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -157,7 +157,7 @@ entry: define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; 
SI-NEXT: s_mov_b32 s10, s6 @@ -179,7 +179,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_mul_v4i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -201,7 +201,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_mul_v4i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -223,7 +223,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: v_mul_v4i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -246,7 +246,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_mul_v4i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -271,7 +271,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX12-LABEL: v_mul_v4i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -326,9 +326,9 @@ entry: define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, i64 %b) { ; SI-LABEL: s_trunc_i64_mul_to_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s7, s[0:1], 0xd +; SI-NEXT: s_load_dword s7, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s0, s4 @@ -341,9 +341,9 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; ; VI-LABEL: s_trunc_i64_mul_to_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s7, s[0:1], 0x34 +; VI-NEXT: s_load_dword s7, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s4 @@ -356,10 +356,10 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; ; GFX9-LABEL: s_trunc_i64_mul_to_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s7, s[0:1], 0x34 -; GFX9-NEXT: ; kill: killed $sgpr0_sgpr1 +; GFX9-NEXT: s_load_dword s7, s[2:3], 0x34 +; GFX9-NEXT: ; kill: killed $sgpr2_sgpr3 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s0, s4 @@ -373,11 +373,11 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; GFX10-LABEL: s_trunc_i64_mul_to_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mul_i32 s0, s2, s6 +; GFX10-NEXT: s_mul_i32 s0, s0, s6 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -386,8 +386,8 @@ define amdgpu_kernel void 
@s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; GFX11-LABEL: s_trunc_i64_mul_to_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mul_i32 s0, s0, s6 @@ -401,8 +401,8 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; GFX12-LABEL: s_trunc_i64_mul_to_i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mul_i32 s0, s0, s6 @@ -433,98 +433,98 @@ entry: define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_trunc_i64_mul_to_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; 
SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_lo_u32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_trunc_i64_mul_to_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_lo_u32 v0, v1, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_trunc_i64_mul_to_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s14, s10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s15, s3 -; 
GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_mov_b32 s2, s10 +; GFX9-NEXT: s_mov_b32 s3, s11 ; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, v1, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_trunc_i64_mul_to_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s14, s2 -; GFX10-NEXT: s_mov_b32 s15, s3 -; GFX10-NEXT: s_mov_b32 s10, s2 -; GFX10-NEXT: s_mov_b32 s11, s3 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_mov_b32 s10, -1 +; GFX10-NEXT: s_mov_b32 s11, 0x31016000 +; GFX10-NEXT: s_mov_b32 s14, s10 +; GFX10-NEXT: s_mov_b32 s15, s11 +; GFX10-NEXT: s_mov_b32 s2, s10 +; GFX10-NEXT: s_mov_b32 s3, s11 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s12, s6 ; GFX10-NEXT: s_mov_b32 s13, s7 ; GFX10-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; GFX10-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; GFX10-NEXT: s_mov_b32 s0, s4 -; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX10-NEXT: s_mov_b32 s8, s4 +; GFX10-NEXT: s_mov_b32 s9, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v0, v1, v0 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_trunc_i64_mul_to_i32: ; GFX11: ; %bb.0: ; %entry ; 
GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -548,8 +548,8 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add ; GFX12-LABEL: v_trunc_i64_mul_to_i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_mov_b32 s10, -1 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000 ; GFX12-NEXT: s_mov_b32 s14, s10 @@ -603,8 +603,8 @@ entry: define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: mul64_sext_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0x50 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -617,11 +617,11 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: mul64_sext_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x50 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], s2, v0, 0 +; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], s4, v0, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_nop 2 @@ -630,43 +630,43 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: mul64_sext_c: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 
0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_i32 s0, s2, 0x50 -; GFX9-NEXT: s_mulk_i32 s2, 0x50 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_mul_hi_i32 s5, s4, 0x50 +; GFX9-NEXT: s_mulk_i32 s4, 0x50 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul64_sext_c: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s0, s2, 0x50 -; GFX10-NEXT: s_mul_hi_i32 s1, s2, 0x50 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: s_mul_i32 s2, s4, 0x50 +; GFX10-NEXT: s_mul_hi_i32 s3, s4, 0x50 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: mul64_sext_c: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s3, s2, 0x50 -; GFX11-NEXT: 
s_mul_hi_i32 s2, s2, 0x50 +; GFX11-NEXT: s_mul_i32 s2, s4, 0x50 +; GFX11-NEXT: s_mul_hi_i32 s3, s4, 0x50 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 @@ -676,7 +676,7 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX12-LABEL: mul64_sext_c: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_ashr_i32 s3, s2, 31 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -711,8 +711,8 @@ entry: define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: mul64_zext_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0x50 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -725,11 +725,11 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: mul64_zext_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x50 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v0, 0 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s4, v0, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_nop 2 @@ -738,43 +738,43 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: mul64_zext_c: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_u32 s0, s2, 0x50 -; GFX9-NEXT: s_mulk_i32 s2, 0x50 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_mul_hi_u32 s5, s4, 0x50 +; GFX9-NEXT: s_mulk_i32 s4, 0x50 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul64_zext_c: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s0, s2, 0x50 -; GFX10-NEXT: s_mul_hi_u32 s1, s2, 0x50 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: s_mul_i32 s2, s4, 0x50 +; GFX10-NEXT: s_mul_hi_u32 s3, s4, 0x50 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: mul64_zext_c: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s3, s2, 0x50 -; GFX11-NEXT: s_mul_hi_u32 s2, s2, 0x50 +; GFX11-NEXT: 
s_mul_i32 s2, s4, 0x50 +; GFX11-NEXT: s_mul_hi_u32 s3, s4, 0x50 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 @@ -784,7 +784,7 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX12-LABEL: mul64_zext_c: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_mov_b32 s3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50 @@ -818,7 +818,7 @@ entry: define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul64_sext_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -838,7 +838,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: v_mul64_sext_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -857,7 +857,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: v_mul64_sext_c: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -877,7 +877,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX10-LABEL: v_mul64_sext_c: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -896,7 +896,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: v_mul64_sext_c: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -917,7 +917,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: v_mul64_sext_c: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -965,7 +965,7 @@ entry: define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul64_zext_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -985,7 +985,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: v_mul64_zext_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1004,7 +1004,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: v_mul64_zext_c: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -1024,7 +1024,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; 
GFX10-LABEL: v_mul64_zext_c: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -1043,7 +1043,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: v_mul64_zext_c: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -1064,7 +1064,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: v_mul64_zext_c: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -1112,7 +1112,7 @@ entry: define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul64_sext_inline_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1131,7 +1131,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_mul64_sext_inline_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1149,7 +1149,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_mul64_sext_inline_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 
s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -1168,7 +1168,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; GFX10-LABEL: v_mul64_sext_inline_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -1187,7 +1187,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_mul64_sext_inline_imm: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -1208,7 +1208,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; GFX12-LABEL: v_mul64_sext_inline_imm: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -1256,9 +1256,9 @@ entry: define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: s_mul_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x13 -; SI-NEXT: s_load_dword s5, s[0:1], 0x1c -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x13 +; SI-NEXT: s_load_dword s5, s[2:3], 0x1c +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1269,9 +1269,9 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [ ; ; VI-LABEL: s_mul_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x4c -; VI-NEXT: s_load_dword s5, s[0:1], 0x70 -; VI-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x4c +; VI-NEXT: s_load_dword s5, s[2:3], 0x70 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1282,40 +1282,41 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [ ; ; GFX9-LABEL: s_mul_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x70 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x4c +; GFX9-NEXT: s_load_dword s5, s[2:3], 0x70 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s0, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c -; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x4c +; GFX10-NEXT: s_load_dword s5, s[2:3], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s0, s2, s3 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: s_mul_i32 s2, s4, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_mul_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 
0x2 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c -; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s2, s2, s3 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x4c +; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x70 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mul_i32 s2, s4, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -1326,12 +1327,13 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [ ; GFX12-LABEL: s_mul_i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x2 -; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x4c -; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x70 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mul_i32 s2, s2, s3 +; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x4c +; GFX12-NEXT: s_load_b32 s5, s[2:3], 0x70 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mul_i32 s2, s4, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: s_mov_b32 s2, -1 ; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null @@ -1358,7 +1360,7 @@ entry: define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1376,7 +1378,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: v_mul_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 
0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1394,7 +1396,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: v_mul_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -1412,7 +1414,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX10-LABEL: v_mul_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -1430,7 +1432,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX11-LABEL: v_mul_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -1450,7 +1452,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX12-LABEL: v_mul_i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -1496,9 +1498,9 @@ entry: define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 x i32], i1 %b) nounwind { ; SI-LABEL: s_mul_i1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x13 -; SI-NEXT: s_load_dword s5, s[0:1], 0x1c -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x13 +; SI-NEXT: s_load_dword s5, s[2:3], 0x1c +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; 
SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1510,9 +1512,9 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 ; ; VI-LABEL: s_mul_i1: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x70 -; VI-NEXT: s_load_dword s5, s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x70 +; VI-NEXT: s_load_dword s5, s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1524,42 +1526,42 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 ; ; GFX9-LABEL: s_mul_i1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x70 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x70 +; GFX9-NEXT: s_load_dword s5, s[2:3], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mul_lo_u16_e32 v0, s3, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mul_lo_u16_e32 v0, s5, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_i1: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c -; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x4c +; GFX10-NEXT: s_load_dword s5, s[2:3], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mul_lo_u16 v0, s2, s3 +; GFX10-NEXT: v_mul_lo_u16 v0, s4, s5 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_mul_i1: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c -; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_lo_u16 v0, s2, s3 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x4c +; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x70 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mul_lo_u16 v0, s4, s5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 @@ -1570,13 +1572,13 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 ; GFX12-LABEL: s_mul_i1: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x2 -; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x4c -; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x70 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mul_lo_u16 v0, s2, s3 +; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x4c +; GFX12-NEXT: s_load_b32 s5, s[2:3], 0x70 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000 ; GFX12-NEXT: s_mov_b32 s2, -1 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mul_lo_u16 v0, s4, s5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX12-NEXT: buffer_store_b8 v0, off, s[0:3], null @@ -1620,7 +1622,7 @@ entry: define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul_i1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1640,7 +1642,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: v_mul_i1: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1660,7 +1662,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX9-LABEL: v_mul_i1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -1680,7 +1682,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX10-LABEL: v_mul_i1: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -1701,7 +1703,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX11-LABEL: v_mul_i1: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -1725,7 +1727,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX12-LABEL: v_mul_i1: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -1793,8 +1795,8 @@ entry: define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: 
s_mul_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1813,8 +1815,8 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; ; VI-LABEL: s_mul_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1831,8 +1833,8 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; ; GFX9-LABEL: s_mul_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1852,18 +1854,18 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; GFX10-LABEL: s_mul_i64: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s0, s6, s3 -; GFX10-NEXT: s_mul_hi_u32 s1, s6, s2 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_add_i32 s0, s1, s0 -; GFX10-NEXT: s_mul_i32 s1, s7, s2 -; GFX10-NEXT: s_mul_i32 s2, s6, s2 -; GFX10-NEXT: s_add_i32 s0, s0, s1 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX10-NEXT: s_mul_i32 s1, s6, s1 +; GFX10-NEXT: s_mul_hi_u32 s2, s6, s0 +; GFX10-NEXT: s_add_i32 s1, s2, s1 +; GFX10-NEXT: s_mul_i32 s2, s7, s0 +; GFX10-NEXT: s_mul_i32 s0, s6, s0 +; GFX10-NEXT: s_add_i32 s1, s1, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: s_mov_b32 s1, s5 @@ -1873,8 +1875,8 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; GFX11-LABEL: s_mul_i64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mul_i32 s1, s6, s1 @@ -1896,8 +1898,8 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; GFX12-LABEL: s_mul_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mul_u64 s[0:1], s[6:7], s[0:1] ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 @@ -1932,21 +1934,21 @@ entry: define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) { ; SI-LABEL: v_mul_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_waitcnt 
lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_lo_u32 v1, v2, v1 ; SI-NEXT: v_mul_hi_u32 v4, v2, v0 @@ -1954,52 +1956,52 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; SI-NEXT: v_mul_lo_u32 v0, v2, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_mul_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_lo_u32 v4, v2, v1 -; VI-NEXT: 
v_mad_u64_u32 v[1:2], s[6:7], v2, v0, 0 +; VI-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v2, v0, 0 ; VI-NEXT: v_mul_lo_u32 v0, v3, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v2 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[1:2], off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_mul_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s2, s10 +; GFX9-NEXT: s_mov_b32 s3, s11 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s14, s2 -; GFX9-NEXT: s_mov_b32 s15, s3 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, v2, v0 @@ -2007,27 +2009,27 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX9-NEXT: v_mul_lo_u32 v0, v2, v0 ; GFX9-NEXT: v_add_u32_e32 v1, v4, v1 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul_i64: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: 
s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s10, s2 -; GFX10-NEXT: s_mov_b32 s11, s3 -; GFX10-NEXT: s_mov_b32 s14, s2 -; GFX10-NEXT: s_mov_b32 s15, s3 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_mov_b32 s10, -1 +; GFX10-NEXT: s_mov_b32 s11, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, s10 +; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_mov_b32 s14, s10 +; GFX10-NEXT: s_mov_b32 s15, s11 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s12, s6 ; GFX10-NEXT: s_mov_b32 s13, s7 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX10-NEXT: s_mov_b32 s0, s4 -; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: s_mov_b32 s8, s4 +; GFX10-NEXT: s_mov_b32 s9, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX10-NEXT: v_mul_hi_u32 v4, v2, v0 @@ -2035,14 +2037,14 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX10-NEXT: v_mul_lo_u32 v0, v2, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v4, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -2072,8 +2074,8 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX12-LABEL: v_mul_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; 
GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_mov_b32 s10, -1 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000 ; GFX12-NEXT: s_mov_b32 s2, s10 @@ -2134,19 +2136,19 @@ entry: define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %a, i32 %b, i32 %c) { ; SI-LABEL: mul32_in_branch: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s2, 0 +; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cbranch_scc0 .LBB15_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mul_i32 s6, s2, s3 +; SI-NEXT: s_mul_i32 s6, s0, s1 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB15_3 ; SI-NEXT: .LBB15_2: ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: .LBB15_3: ; %Flow -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc @@ -2169,19 +2171,19 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: mul32_in_branch: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cbranch_scc0 .LBB15_2 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_mul_i32 s6, s2, s3 +; VI-NEXT: s_mul_i32 s6, s0, s1 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB15_3 ; VI-NEXT: .LBB15_2: ; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: ; implicit-def: $sgpr6 ; VI-NEXT: .LBB15_3: ; %Flow -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; VI-NEXT: s_cbranch_vccnz 
.LBB15_5 ; VI-NEXT: ; %bb.4: ; %if @@ -2204,19 +2206,19 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: mul32_in_branch: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s2, 0 +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX9-NEXT: ; %bb.1: ; %else -; GFX9-NEXT: s_mul_i32 s6, s2, s3 +; GFX9-NEXT: s_mul_i32 s6, s0, s1 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB15_3 ; GFX9-NEXT: .LBB15_2: ; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: ; implicit-def: $sgpr6 ; GFX9-NEXT: .LBB15_3: ; %Flow -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB15_5 ; GFX9-NEXT: ; %bb.4: ; %if @@ -2239,19 +2241,19 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: mul32_in_branch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lg_u32 s2, 0 +; GFX10-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX10-NEXT: ; %bb.1: ; %else -; GFX10-NEXT: s_mul_i32 s5, s2, s3 +; GFX10-NEXT: s_mul_i32 s5, s0, s1 ; GFX10-NEXT: s_branch .LBB15_3 ; GFX10-NEXT: .LBB15_2: ; GFX10-NEXT: s_mov_b32 s4, -1 ; GFX10-NEXT: ; implicit-def: $sgpr5 ; GFX10-NEXT: .LBB15_3: ; %Flow -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_vccnz .LBB15_5 ; GFX10-NEXT: ; %bb.4: ; %if @@ -2274,19 +2276,19 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: mul32_in_branch: ; GFX11: ; 
%bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX11-NEXT: ; %bb.1: ; %else -; GFX11-NEXT: s_mul_i32 s5, s2, s3 +; GFX11-NEXT: s_mul_i32 s5, s0, s1 ; GFX11-NEXT: s_branch .LBB15_3 ; GFX11-NEXT: .LBB15_2: ; GFX11-NEXT: s_mov_b32 s4, -1 ; GFX11-NEXT: ; implicit-def: $sgpr5 ; GFX11-NEXT: .LBB15_3: ; %Flow -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB15_5 ; GFX11-NEXT: ; %bb.4: ; %if @@ -2311,19 +2313,19 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: mul32_in_branch: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_lg_u32 s2, 0 +; GFX12-NEXT: s_cmp_lg_u32 s0, 0 ; GFX12-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX12-NEXT: ; %bb.1: ; %else -; GFX12-NEXT: s_mul_i32 s5, s2, s3 +; GFX12-NEXT: s_mul_i32 s5, s0, s1 ; GFX12-NEXT: s_branch .LBB15_3 ; GFX12-NEXT: .LBB15_2: ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: ; implicit-def: $sgpr5 ; GFX12-NEXT: .LBB15_3: ; %Flow -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_vccnz .LBB15_5 ; GFX12-NEXT: ; %bb.4: ; %if @@ -2403,7 +2405,7 @@ endif: define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) { ; SI-LABEL: mul64_in_branch: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: s_waitcnt 
lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 @@ -2438,7 +2440,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: mul64_in_branch: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -2470,7 +2472,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: mul64_in_branch: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -2506,7 +2508,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: mul64_in_branch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX10-NEXT: s_cbranch_scc0 .LBB16_3 @@ -2540,7 +2542,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: mul64_in_branch: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB16_3 @@ -2575,7 +2577,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: mul64_in_branch: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX12-NEXT: s_cbranch_scc0 .LBB16_3 @@ -2668,9 +2670,9 @@ endif: define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, [8 
x i32], i128 %b) nounwind #0 { ; SI-LABEL: s_mul_i128: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x1f -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x1f +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2717,9 +2719,9 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; ; VI-LABEL: s_mul_i128: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x7c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2756,96 +2758,96 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; ; GFX9-LABEL: s_mul_i128: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x4c -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x7c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4c +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x7c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s0, s12, s11 -; GFX9-NEXT: s_mul_hi_u32 s1, s12, s10 -; GFX9-NEXT: s_mul_i32 s2, s14, s9 -; GFX9-NEXT: s_mul_hi_u32 s3, s14, s8 -; GFX9-NEXT: s_add_i32 s0, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s13, s10 -; GFX9-NEXT: s_add_i32 s2, s3, s2 -; GFX9-NEXT: s_mul_i32 s3, s15, s8 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_mul_i32 s1, s12, s10 -; 
GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_mul_i32 s3, s14, s8 -; GFX9-NEXT: s_add_u32 s3, s3, s1 -; GFX9-NEXT: s_addc_u32 s2, s2, s0 -; GFX9-NEXT: s_mul_i32 s14, s9, s12 -; GFX9-NEXT: s_mul_hi_u32 s15, s8, s12 -; GFX9-NEXT: s_mul_hi_u32 s11, s9, s12 +; GFX9-NEXT: s_mul_i32 s7, s8, s7 +; GFX9-NEXT: s_mul_hi_u32 s12, s8, s6 +; GFX9-NEXT: s_add_i32 s7, s12, s7 +; GFX9-NEXT: s_mul_i32 s12, s9, s6 +; GFX9-NEXT: s_add_i32 s7, s7, s12 +; GFX9-NEXT: s_mul_i32 s12, s10, s5 +; GFX9-NEXT: s_mul_hi_u32 s13, s10, s4 +; GFX9-NEXT: s_add_i32 s12, s13, s12 +; GFX9-NEXT: s_mul_i32 s11, s11, s4 +; GFX9-NEXT: s_mul_i32 s6, s8, s6 +; GFX9-NEXT: s_add_i32 s12, s12, s11 +; GFX9-NEXT: s_mul_i32 s10, s10, s4 +; GFX9-NEXT: s_add_u32 s10, s10, s6 +; GFX9-NEXT: s_addc_u32 s11, s12, s7 +; GFX9-NEXT: s_mul_i32 s14, s5, s8 +; GFX9-NEXT: s_mul_hi_u32 s15, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s13, s5, s8 ; GFX9-NEXT: s_add_u32 s14, s14, s15 -; GFX9-NEXT: s_mul_i32 s1, s8, s13 -; GFX9-NEXT: s_addc_u32 s11, s11, 0 -; GFX9-NEXT: s_mul_hi_u32 s10, s8, s13 -; GFX9-NEXT: s_add_u32 s1, s1, s14 -; GFX9-NEXT: s_addc_u32 s10, s10, 0 -; GFX9-NEXT: s_add_u32 s10, s11, s10 -; GFX9-NEXT: s_addc_u32 s11, 0, 0 -; GFX9-NEXT: s_mul_hi_u32 s14, s9, s13 -; GFX9-NEXT: s_mul_i32 s9, s9, s13 -; GFX9-NEXT: s_add_u32 s9, s9, s10 -; GFX9-NEXT: s_addc_u32 s10, s14, s11 -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: s_add_u32 s9, s9, s3 -; GFX9-NEXT: s_addc_u32 s10, s10, s2 -; GFX9-NEXT: s_mul_i32 s2, s8, s12 -; GFX9-NEXT: s_mov_b32 s3, s0 -; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_mul_i32 s7, s4, s9 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_mul_hi_u32 s12, s4, s9 +; GFX9-NEXT: s_add_u32 s7, s7, s14 +; GFX9-NEXT: s_addc_u32 s12, s12, 0 +; GFX9-NEXT: s_add_u32 s12, s13, s12 +; GFX9-NEXT: s_addc_u32 
s13, 0, 0 +; GFX9-NEXT: s_mul_hi_u32 s14, s5, s9 +; GFX9-NEXT: s_mul_i32 s5, s5, s9 +; GFX9-NEXT: s_add_u32 s5, s5, s12 +; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: s_addc_u32 s9, s14, s13 +; GFX9-NEXT: s_add_u32 s10, s5, s10 +; GFX9-NEXT: s_mul_i32 s4, s4, s8 +; GFX9-NEXT: s_mov_b32 s5, s6 +; GFX9-NEXT: s_addc_u32 s9, s9, s11 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_i128: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c -; GFX10-NEXT: s_mov_b32 s2, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s13, s2 +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4c +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x7c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_mov_b32 s12, 0 +; GFX10-NEXT: s_mov_b32 s3, s12 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s3, s8, s7 +; GFX10-NEXT: s_mul_i32 s2, s8, s7 ; GFX10-NEXT: s_mul_hi_u32 s7, s8, s6 ; GFX10-NEXT: s_mul_i32 s14, s10, s5 ; GFX10-NEXT: s_mul_hi_u32 s15, s10, s4 -; GFX10-NEXT: s_mul_i32 s12, s9, s6 +; GFX10-NEXT: s_mul_i32 s13, s9, s6 ; GFX10-NEXT: s_mul_i32 s11, s11, s4 -; GFX10-NEXT: s_add_i32 s3, s7, s3 +; GFX10-NEXT: s_add_i32 s2, s7, s2 ; GFX10-NEXT: s_add_i32 s7, s15, s14 ; GFX10-NEXT: s_mul_i32 s6, s8, s6 ; GFX10-NEXT: s_mul_i32 s10, s10, s4 -; GFX10-NEXT: s_add_i32 s3, s3, s12 +; GFX10-NEXT: s_add_i32 s2, s2, s13 ; GFX10-NEXT: s_add_i32 s7, s7, s11 ; GFX10-NEXT: s_mul_i32 s19, s5, s8 ; GFX10-NEXT: s_mul_hi_u32 s20, s4, s8 ; GFX10-NEXT: s_add_u32 s6, s10, s6 ; GFX10-NEXT: s_mul_hi_u32 s18, s5, s8 -; GFX10-NEXT: s_addc_u32 s7, s7, s3 +; GFX10-NEXT: s_addc_u32 s7, s7, s2 ; 
GFX10-NEXT: s_mul_i32 s17, s4, s9 -; GFX10-NEXT: s_add_u32 s3, s19, s20 +; GFX10-NEXT: s_add_u32 s2, s19, s20 ; GFX10-NEXT: s_mul_hi_u32 s16, s4, s9 ; GFX10-NEXT: s_mul_hi_u32 s21, s5, s9 ; GFX10-NEXT: s_mul_i32 s5, s5, s9 ; GFX10-NEXT: s_addc_u32 s9, s18, 0 -; GFX10-NEXT: s_add_u32 s3, s17, s3 +; GFX10-NEXT: s_add_u32 s13, s17, s2 ; GFX10-NEXT: s_addc_u32 s10, s16, 0 -; GFX10-NEXT: s_mul_i32 s12, s4, s8 +; GFX10-NEXT: s_mul_i32 s2, s4, s8 ; GFX10-NEXT: s_add_u32 s4, s9, s10 ; GFX10-NEXT: s_addc_u32 s8, 0, 0 ; GFX10-NEXT: s_add_u32 s4, s5, s4 ; GFX10-NEXT: s_addc_u32 s5, s21, s8 ; GFX10-NEXT: s_add_u32 s4, s4, s6 ; GFX10-NEXT: s_addc_u32 s5, s5, s7 -; GFX10-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3] +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[12:13] ; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 @@ -2858,46 +2860,46 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; GFX11-LABEL: s_mul_i128: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x4c -; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x7c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x4c +; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x7c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_mov_b32 s12, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 s13, s2 +; GFX11-NEXT: s_mov_b32 s3, s12 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s3, s8, s7 +; GFX11-NEXT: s_mul_i32 s2, s8, s7 ; GFX11-NEXT: s_mul_hi_u32 s7, s8, s6 ; GFX11-NEXT: s_mul_i32 s14, s10, s5 ; GFX11-NEXT: s_mul_hi_u32 s15, s10, s4 -; GFX11-NEXT: s_mul_i32 s12, s9, s6 +; GFX11-NEXT: s_mul_i32 s13, s9, s6 ; GFX11-NEXT: s_mul_i32 s11, s11, s4 -; GFX11-NEXT: s_add_i32 s3, s7, s3 +; GFX11-NEXT: s_add_i32 s2, s7, s2 ; GFX11-NEXT: s_add_i32 s7, s15, s14 ; GFX11-NEXT: s_mul_i32 s6, s8, s6 ; 
GFX11-NEXT: s_mul_i32 s10, s10, s4 -; GFX11-NEXT: s_add_i32 s3, s3, s12 +; GFX11-NEXT: s_add_i32 s2, s2, s13 ; GFX11-NEXT: s_add_i32 s7, s7, s11 ; GFX11-NEXT: s_mul_i32 s19, s5, s8 ; GFX11-NEXT: s_mul_hi_u32 s20, s4, s8 ; GFX11-NEXT: s_add_u32 s6, s10, s6 ; GFX11-NEXT: s_mul_hi_u32 s18, s5, s8 -; GFX11-NEXT: s_addc_u32 s7, s7, s3 +; GFX11-NEXT: s_addc_u32 s7, s7, s2 ; GFX11-NEXT: s_mul_i32 s17, s4, s9 -; GFX11-NEXT: s_add_u32 s3, s19, s20 +; GFX11-NEXT: s_add_u32 s2, s19, s20 ; GFX11-NEXT: s_mul_hi_u32 s16, s4, s9 ; GFX11-NEXT: s_mul_hi_u32 s21, s5, s9 ; GFX11-NEXT: s_mul_i32 s5, s5, s9 ; GFX11-NEXT: s_addc_u32 s9, s18, 0 -; GFX11-NEXT: s_add_u32 s3, s17, s3 +; GFX11-NEXT: s_add_u32 s13, s17, s2 ; GFX11-NEXT: s_addc_u32 s10, s16, 0 -; GFX11-NEXT: s_mul_i32 s12, s4, s8 +; GFX11-NEXT: s_mul_i32 s2, s4, s8 ; GFX11-NEXT: s_add_u32 s4, s9, s10 ; GFX11-NEXT: s_addc_u32 s8, 0, 0 ; GFX11-NEXT: s_add_u32 s4, s5, s4 ; GFX11-NEXT: s_addc_u32 s5, s21, s8 ; GFX11-NEXT: s_add_u32 s4, s4, s6 ; GFX11-NEXT: s_addc_u32 s5, s5, s7 -; GFX11-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3] +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[12:13] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5 @@ -2911,40 +2913,40 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; GFX12-LABEL: s_mul_i128: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x7c -; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x4c -; GFX12-NEXT: s_mov_b32 s3, 0 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: s_mov_b32 s15, s3 -; GFX12-NEXT: s_mov_b32 s13, s3 -; GFX12-NEXT: s_mov_b32 s17, s3 -; GFX12-NEXT: s_mov_b32 s19, s3 -; GFX12-NEXT: s_mov_b32 s24, s3 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x7c +; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x4c +; GFX12-NEXT: s_mov_b32 s13, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 
+; GFX12-NEXT: s_mov_b32 s15, s13 +; GFX12-NEXT: s_mov_b32 s3, s13 +; GFX12-NEXT: s_mov_b32 s17, s13 +; GFX12-NEXT: s_mov_b32 s19, s13 +; GFX12-NEXT: s_mov_b32 s24, s13 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s2, s4 +; GFX12-NEXT: s_mov_b32 s12, s4 ; GFX12-NEXT: s_mov_b32 s14, s8 -; GFX12-NEXT: s_mov_b32 s12, s9 -; GFX12-NEXT: s_mul_u64 s[22:23], s[14:15], s[2:3] -; GFX12-NEXT: s_mul_u64 s[20:21], s[12:13], s[2:3] -; GFX12-NEXT: s_mov_b32 s2, s23 +; GFX12-NEXT: s_mov_b32 s2, s9 +; GFX12-NEXT: s_mul_u64 s[22:23], s[14:15], s[12:13] +; GFX12-NEXT: s_mul_u64 s[20:21], s[2:3], s[12:13] +; GFX12-NEXT: s_mov_b32 s12, s23 ; GFX12-NEXT: s_mov_b32 s16, s5 ; GFX12-NEXT: s_mul_u64 s[4:5], s[4:5], s[10:11] -; GFX12-NEXT: s_add_nc_u64 s[10:11], s[20:21], s[2:3] +; GFX12-NEXT: s_add_nc_u64 s[10:11], s[20:21], s[12:13] ; GFX12-NEXT: s_mul_u64 s[6:7], s[6:7], s[8:9] ; GFX12-NEXT: s_mul_u64 s[8:9], s[14:15], s[16:17] -; GFX12-NEXT: s_mov_b32 s2, s11 -; GFX12-NEXT: s_mov_b32 s11, s3 +; GFX12-NEXT: s_mov_b32 s12, s11 +; GFX12-NEXT: s_mov_b32 s11, s13 ; GFX12-NEXT: s_add_nc_u64 s[4:5], s[6:7], s[4:5] ; GFX12-NEXT: s_add_nc_u64 s[6:7], s[8:9], s[10:11] -; GFX12-NEXT: s_mul_u64 s[12:13], s[12:13], s[16:17] +; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], s[16:17] ; GFX12-NEXT: s_mov_b32 s18, s7 -; GFX12-NEXT: s_mov_b32 s23, s3 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[18:19] ; GFX12-NEXT: s_mov_b32 s25, s6 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[12:13], s[2:3] -; GFX12-NEXT: s_or_b64 s[6:7], s[22:23], s[24:25] +; GFX12-NEXT: s_add_nc_u64 s[6:7], s[12:13], s[18:19] +; GFX12-NEXT: s_mov_b32 s23, s13 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[6:7] +; GFX12-NEXT: s_or_b64 s[8:9], s[22:23], s[24:25] ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000 ; 
GFX12-NEXT: s_mov_b32 s2, -1 @@ -3011,7 +3013,7 @@ entry: define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 { ; SI-LABEL: v_mul_i128: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v8, 4, v0 @@ -3060,7 +3062,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; VI-LABEL: v_mul_i128: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; VI-NEXT: v_mov_b32_e32 v11, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3100,7 +3102,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; GFX9-LABEL: v_mul_i128: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 4, v0 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3131,7 +3133,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; GFX10-LABEL: v_mul_i128: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2c ; GFX10-NEXT: v_lshlrev_b32_e32 v13, 4, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3163,7 +3165,9 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; GFX11-LABEL: v_mul_i128: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x2c +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v15, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -3201,7 +3205,9 
@@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; GFX12-LABEL: v_mul_i128: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x2c +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v13, 4, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll index 357b851a8f56f..842dc36e00154 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; SI-LABEL: test_smul24_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_i32 s2, s2, 0x180000 @@ -24,7 +24,7 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; VI-LABEL: test_smul24_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -39,7 +39,7 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; GFX9-LABEL: test_smul24_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -100,7 +100,7 @@ entry: define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; SI-LABEL: test_smulhi24_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: 
s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -113,7 +113,7 @@ define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32 ; ; VI-LABEL: test_smulhi24_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -126,7 +126,7 @@ define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX9-LABEL: test_smulhi24_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -274,26 +274,26 @@ define <2 x i64> @test_smul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { define amdgpu_kernel void @test_smul24_i64(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 { ; SI-LABEL: test_smul24_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s2, s[0:1], 0x13 -; SI-NEXT: s_load_dword s0, s[0:1], 0x1c -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x13 +; SI-NEXT: s_load_dword s5, s[2:3], 0x1c +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_i32 s1, s2, 0x180000 -; SI-NEXT: s_bfe_i32 s0, s0, 0x180000 -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: s_mul_i32 s1, s0, s1 -; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0 -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_bfe_i32 s4, s4, 0x180000 +; SI-NEXT: s_bfe_i32 s5, s5, 0x180000 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_mul_i32 s4, s5, s4 +; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s5, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 
0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_smul24_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x4c -; VI-NEXT: s_load_dword s5, s[0:1], 0x70 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x4c +; VI-NEXT: s_load_dword s5, s[2:3], 0x70 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -307,19 +307,19 @@ define amdgpu_kernel void @test_smul24_i64(ptr addrspace(1) %out, [8 x i32], i32 ; ; GFX9-LABEL: test_smul24_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x70 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x4c +; GFX9-NEXT: s_load_dword s5, s[2:3], 0x70 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000 -; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000 -; GFX9-NEXT: s_mul_hi_i32 s2, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s1, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_bfe_i32 s4, s4, 0x180000 +; GFX9-NEXT: s_bfe_i32 s5, s5, 0x180000 +; GFX9-NEXT: s_mul_hi_i32 s6, s5, s4 +; GFX9-NEXT: s_mul_i32 s5, s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smul24_i64: @@ -376,8 +376,8 @@ define amdgpu_kernel void @test_smul24_i64(ptr addrspace(1) %out, [8 x i32], i32 define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; SI-LABEL: test_smul24_i64_square: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword 
s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -390,8 +390,8 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, ; ; VI-LABEL: test_smul24_i64_square: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -403,17 +403,17 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, ; ; GFX9-LABEL: test_smul24_i64_square: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000 -; GFX9-NEXT: s_mul_hi_i32 s1, s0, s0 -; GFX9-NEXT: s_mul_i32 s0, s0, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_bfe_i32 s4, s4, 0x180000 +; GFX9-NEXT: s_mul_hi_i32 s5, s4, s4 +; GFX9-NEXT: s_mul_i32 s4, s4, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smul24_i64_square: @@ -463,33 +463,33 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) #0 { ; SI-LABEL: test_smul24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dword s0, 
s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dword s6, s[2:3], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s1, s2, 8 -; SI-NEXT: s_lshl_b32 s3, s0, 8 -; SI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 -; SI-NEXT: s_ashr_i64 s[0:1], s[0:1], 40 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: s_mul_i32 s1, s0, s2 -; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0 -; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_lshl_b32 s5, s4, 8 +; SI-NEXT: s_lshl_b32 s7, s6, 8 +; SI-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 +; SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_mul_i32 s5, s4, s6 +; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s4, v0 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 31 ; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 31 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_smul24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dword s5, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s3, s2, 8 -; VI-NEXT: s_lshl_b32 s5, s4, 8 +; VI-NEXT: s_lshl_b32 s3, s4, 8 +; VI-NEXT: s_lshl_b32 s5, s5, 8 ; VI-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 ; VI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -504,23 +504,23 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b ; ; GFX9-LABEL: test_smul24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34 -; 
GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s1, s2, 8 -; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 40 -; GFX9-NEXT: s_lshl_b32 s1, s3, 8 -; GFX9-NEXT: s_ashr_i64 s[2:3], s[0:1], 40 -; GFX9-NEXT: s_mul_hi_i32 s1, s0, s2 -; GFX9-NEXT: s_mul_i32 s0, s0, s2 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 31 -; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 31 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_lshl_b32 s5, s4, 8 +; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 +; GFX9-NEXT: s_lshl_b32 s5, s6, 8 +; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 40 +; GFX9-NEXT: s_mul_hi_i32 s5, s4, s6 +; GFX9-NEXT: s_mul_i32 s4, s4, s6 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 31 +; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 31 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smul24_i33: @@ -580,9 +580,9 @@ entry: define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) { ; SI-LABEL: test_smulhi24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xd -; SI-NEXT: s_load_dword s5, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xd +; SI-NEXT: s_load_dword s5, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -594,9 +594,9 @@ define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 ; ; VI-LABEL: test_smulhi24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 -; VI-NEXT: s_load_dword s5, 
s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x34 +; VI-NEXT: s_load_dword s5, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -608,20 +608,20 @@ define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 ; ; GFX9-LABEL: test_smulhi24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s1, s2, 8 -; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 40 -; GFX9-NEXT: s_lshl_b32 s1, s3, 8 -; GFX9-NEXT: s_ashr_i64 s[2:3], s[0:1], 40 -; GFX9-NEXT: s_mul_hi_i32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s0, s0, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_lshl_b32 s5, s4, 8 +; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 +; GFX9-NEXT: s_lshl_b32 s5, s6, 8 +; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 40 +; GFX9-NEXT: s_mul_hi_i32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smulhi24_i33: @@ -672,15 +672,15 @@ entry: define amdgpu_kernel void @simplify_i24_crash(ptr addrspace(1) %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) { ; SI-LABEL: simplify_i24_crash: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s2, 0 +; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cbranch_scc0 .LBB8_2 ; SI-NEXT: ; 
%bb.1: ; %bb7 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB8_2: ; %bb11 -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_i32 s2, s4, 0x180000 @@ -694,15 +694,15 @@ define amdgpu_kernel void @simplify_i24_crash(ptr addrspace(1) %out, i32 %arg0, ; ; VI-LABEL: simplify_i24_crash: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dword s0, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cbranch_scc0 .LBB8_2 ; VI-NEXT: ; %bb.1: ; %bb7 ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB8_2: ; %bb11 -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -716,24 +716,24 @@ define amdgpu_kernel void @simplify_i24_crash(ptr addrspace(1) %out, i32 %arg0, ; ; GFX9-LABEL: simplify_i24_crash: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s2, 0 +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %bb7 ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB8_2: ; %bb11 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s4, 0x180000 -; GFX9-NEXT: s_bfe_i32 s1, s6, 0x180000 -; 
GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: s_bfe_i32 s4, s4, 0x180000 +; GFX9-NEXT: s_bfe_i32 s5, s6, 0x180000 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: simplify_i24_crash: diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll index 3a16c88f32cc3..0c0bb830ba847 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: test_umul24_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s2, s2, 0xffffff @@ -24,7 +24,7 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; VI-LABEL: test_umul24_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -39,7 +39,7 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; GFX9-LABEL: test_umul24_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -64,13 +64,13 @@ entry: define amdgpu_kernel void @test_umul24_i16_sext(ptr addrspace(1) %out, i16 %a, i16 %b) { ; SI-LABEL: test_umul24_i16_sext: ; SI: ; %bb.0: ; %entry -; 
SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s2, 16 -; SI-NEXT: s_mul_i32 s2, s2, s4 -; SI-NEXT: s_sext_i32_i16 s4, s2 +; SI-NEXT: s_lshr_b32 s2, s4, 16 +; SI-NEXT: s_mul_i32 s4, s4, s2 +; SI-NEXT: s_sext_i32_i16 s4, s4 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -78,8 +78,8 @@ define amdgpu_kernel void @test_umul24_i16_sext(ptr addrspace(1) %out, i16 %a, i ; ; VI-LABEL: test_umul24_i16_sext: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -92,16 +92,16 @@ define amdgpu_kernel void @test_umul24_i16_sext(ptr addrspace(1) %out, i16 %a, i ; ; GFX9-LABEL: test_umul24_i16_sext: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s2, 16 -; GFX9-NEXT: s_mul_i32 s2, s2, s0 -; GFX9-NEXT: s_sext_i32_i16 s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_lshr_b32 s5, s4, 16 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: s_sext_i32_i16 s4, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %mul = mul i16 %a, %b @@ -113,7 +113,7 @@ entry: define amdgpu_kernel void 
@test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: test_umul24_i16_vgpr_sext: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 @@ -136,7 +136,7 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr ; ; VI-LABEL: test_umul24_i16_vgpr_sext: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -158,7 +158,7 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: test_umul24_i16_vgpr_sext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -186,13 +186,13 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr define amdgpu_kernel void @test_umul24_i16(ptr addrspace(1) %out, i16 %a, i16 %b) { ; SI-LABEL: test_umul24_i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s2, 16 -; SI-NEXT: s_mul_i32 s2, s2, s4 -; SI-NEXT: s_and_b32 s4, s2, 0xffff +; SI-NEXT: s_lshr_b32 s2, s4, 16 +; SI-NEXT: s_mul_i32 s4, s4, s2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -200,8 +200,8 @@ define amdgpu_kernel void @test_umul24_i16(ptr addrspace(1) %out, i16 %a, i16 %b ; ; VI-LABEL: test_umul24_i16: ; VI: ; %bb.0: ; 
%entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -214,16 +214,16 @@ define amdgpu_kernel void @test_umul24_i16(ptr addrspace(1) %out, i16 %a, i16 %b ; ; GFX9-LABEL: test_umul24_i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s2, 16 -; GFX9-NEXT: s_mul_i32 s2, s2, s0 -; GFX9-NEXT: s_and_b32 s0, s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_lshr_b32 s5, s4, 16 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %mul = mul i16 %a, %b @@ -235,7 +235,7 @@ entry: define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: test_umul24_i16_vgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 @@ -258,7 +258,7 @@ define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_umul24_i16_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -279,7 +279,7 @@ define amdgpu_kernel 
void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_umul24_i16_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -307,8 +307,8 @@ define amdgpu_kernel void @test_umul24_i8_vgpr(ptr addrspace(1) %out, ptr addrsp ; SI-LABEL: test_umul24_i8_vgpr: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_mov_b32_e32 v3, v0 -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: v_mov_b32_e32 v4, 0 @@ -330,8 +330,8 @@ define amdgpu_kernel void @test_umul24_i8_vgpr(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: test_umul24_i8_vgpr: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0 @@ -351,11 +351,11 @@ define amdgpu_kernel void @test_umul24_i8_vgpr(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: test_umul24_i8_vgpr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] -; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] +; GFX9-NEXT: global_load_ubyte v3, v1, s[0:1] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -379,7 +379,7 @@ entry: define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 
%a, i32 %b) { ; SI-LABEL: test_umulhi24_i32_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -392,7 +392,7 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a, ; ; VI-LABEL: test_umulhi24_i32_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -405,7 +405,7 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a, ; ; GFX9-LABEL: test_umulhi24_i32_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -432,9 +432,9 @@ entry: define amdgpu_kernel void @test_umulhi24(ptr addrspace(1) %out, i64 %a, i64 %b) { ; SI-LABEL: test_umulhi24: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s7, s[0:1], 0xd +; SI-NEXT: s_load_dword s7, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s0, s4 @@ -447,9 +447,9 @@ define amdgpu_kernel void @test_umulhi24(ptr addrspace(1) %out, i64 %a, i64 %b) ; ; VI-LABEL: test_umulhi24: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s7, s[0:1], 0x34 +; VI-NEXT: s_load_dword s7, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s4 @@ -462,19 +462,18 @@ define amdgpu_kernel void @test_umulhi24(ptr addrspace(1) %out, i64 %a, i64 %b) ; ; GFX9-LABEL: 
test_umulhi24: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s7, s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_and_b32 s1, s6, 0xffffff +; GFX9-NEXT: s_and_b32 s0, s0, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GFX9-NEXT: s_endpgm entry: %a.24 = and i64 %a, 16777215 @@ -490,9 +489,9 @@ entry: define amdgpu_kernel void @test_umul24_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; SI-LABEL: test_umul24_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s7, s[0:1], 0xd +; SI-NEXT: s_load_dword s7, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s0, s4 @@ -509,9 +508,9 @@ define amdgpu_kernel void @test_umul24_i64(ptr addrspace(1) %out, i64 %a, i64 %b ; ; VI-LABEL: test_umul24_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s7, s[0:1], 0x34 +; VI-NEXT: s_load_dword s7, s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s4 @@ -525,21 +524,20 @@ define amdgpu_kernel void 
@test_umul24_i64(ptr addrspace(1) %out, i64 %a, i64 %b ; ; GFX9-LABEL: test_umul24_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s7, s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s6, s4, s5 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_and_b32 s1, s6, 0xffffff +; GFX9-NEXT: s_and_b32 s0, s0, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s2, s1, s0 +; GFX9-NEXT: s_mul_i32 s1, s1, s0 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX9-NEXT: s_endpgm entry: %tmp0 = shl i64 %a, 40 @@ -582,8 +580,8 @@ define <2 x i64> @test_umul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { define amdgpu_kernel void @test_umul24_i64_square(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: test_umul24_i64_square: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -596,8 +594,8 @@ define amdgpu_kernel void @test_umul24_i64_square(ptr addrspace(1) %out, [8 x i3 ; ; VI-LABEL: test_umul24_i64_square: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -608,17 +606,17 @@ define amdgpu_kernel void @test_umul24_i64_square(ptr addrspace(1) %out, [8 x i3 ; ; GFX9-LABEL: test_umul24_i64_square: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s1, s0, s0 -; GFX9-NEXT: s_mul_i32 s0, s0, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s5, s4, s4 +; GFX9-NEXT: s_mul_i32 s4, s4, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %tmp0 = shl i64 %a, 40 @@ -631,7 +629,7 @@ entry: define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: test_umulhi16_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s2, s2, 0xffff @@ -647,7 +645,7 @@ define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32 ; ; VI-LABEL: test_umulhi16_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -663,7 +661,7 @@ define 
amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX9-LABEL: test_umulhi16_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff @@ -685,27 +683,27 @@ entry: define amdgpu_kernel void @test_umul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) { ; SI-LABEL: test_umul24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dword s0, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dword s5, s[2:3], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s1, s2, 0xffffff -; SI-NEXT: s_and_b32 s3, s0, 0xffffff -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s2, v0 -; SI-NEXT: s_mul_i32 s1, s1, s3 +; SI-NEXT: s_and_b32 s6, s4, 0xffffff +; SI-NEXT: s_and_b32 s7, s5, 0xffffff +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s4, v0 +; SI-NEXT: s_mul_i32 s6, s6, s7 ; SI-NEXT: v_and_b32_e32 v1, 1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_umul24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 -; VI-NEXT: s_load_dword s5, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x34 +; VI-NEXT: s_load_dword s5, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -718,20 +716,20 @@ define amdgpu_kernel void 
@test_umul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b ; ; GFX9-LABEL: test_umul24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s5, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff -; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff -; GFX9-NEXT: s_mul_i32 s2, s0, s1 -; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s0, s0, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffffff +; GFX9-NEXT: s_and_b32 s5, s5, 0xffffff +; GFX9-NEXT: s_mul_i32 s6, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %tmp0 = shl i33 %a, 9 @@ -747,9 +745,9 @@ entry: define amdgpu_kernel void @test_umulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) { ; SI-LABEL: test_umulhi24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xd -; SI-NEXT: s_load_dword s5, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xd +; SI-NEXT: s_load_dword s5, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -761,9 +759,9 @@ define amdgpu_kernel void @test_umulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 ; ; VI-LABEL: test_umulhi24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 -; VI-NEXT: s_load_dword s5, s[0:1], 0x2c 
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x34 +; VI-NEXT: s_load_dword s5, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -775,18 +773,18 @@ define amdgpu_kernel void @test_umulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 ; ; GFX9-LABEL: test_umulhi24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s5, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff -; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s0, s0, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffffff +; GFX9-NEXT: s_and_b32 s5, s5, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %tmp0 = shl i33 %a, 9 diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll index 16de2c0c6de08..727b607e7ded0 100644 --- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll +++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll @@ -163,7 +163,7 @@ define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 { ; ; GCN-LABEL: multi_if_break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s2, s[0:1], 0x9 +; GCN-NEXT: s_load_dword s2, s[2:3], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git 
a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll index f6e3509eb029b..296d484e247d6 100644 --- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll @@ -10,7 +10,7 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(ptr addrspace(3) nocapture %arg) #0 { ; GCN-LABEL: reduced_nested_loop_conditions: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[2:3], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: s_mov_b32 s2, 0 @@ -93,7 +93,6 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(ptr addrspace(3) nocap ; IR: bb23: ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP6]]) ; IR-NEXT: ret void -; bb: %my.tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %my.tmp1 = getelementptr inbounds i64, ptr addrspace(3) %arg, i32 %my.tmp @@ -277,7 +276,6 @@ define amdgpu_kernel void @nested_loop_conditions(ptr addrspace(1) nocapture %ar ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP7]]) ; IR-NEXT: store volatile i32 0, ptr addrspace(1) undef, align 4 ; IR-NEXT: ret void -; bb: %my.tmp1134 = load volatile i32, ptr addrspace(1) undef %my.tmp1235 = icmp slt i32 %my.tmp1134, 9 diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll index ba012b208c957..b84686139d0e2 100644 --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -120,61 +120,61 @@ bb.2: ; ASSUME1024: ; ScratchSize: 1040 define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) { -; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: -; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 -; MUBUF-NEXT: s_add_u32 s0, s0, s9 -; MUBUF-NEXT: s_addc_u32 s1, s1, 0 -; MUBUF-NEXT: 
s_mov_b32 s33, 0 -; MUBUF-NEXT: s_movk_i32 s32, 0x1000 -; MUBUF-NEXT: s_waitcnt lgkmcnt(0) -; MUBUF-NEXT: s_cmp_lg_u32 s6, 0 -; MUBUF-NEXT: s_cbranch_scc1 .LBB1_2 -; MUBUF-NEXT: ; %bb.1: ; %bb.0 -; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 -; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000 -; MUBUF-NEXT: s_lshl_b32 s7, s7, 2 -; MUBUF-NEXT: s_mov_b32 s32, s6 -; MUBUF-NEXT: v_mov_b32_e32 v1, 0 -; MUBUF-NEXT: v_mov_b32_e32 v2, s6 -; MUBUF-NEXT: v_mov_b32_e32 v3, 1 -; MUBUF-NEXT: s_add_i32 s6, s6, s7 -; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; MUBUF-NEXT: v_mov_b32_e32 v2, s6 -; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_add_u32_e32 v0, v2, v0 -; MUBUF-NEXT: s_waitcnt lgkmcnt(0) -; MUBUF-NEXT: global_store_dword v1, v0, s[4:5] -; MUBUF-NEXT: .LBB1_2: ; %bb.1 -; MUBUF-NEXT: v_mov_b32_e32 v0, 0 -; MUBUF-NEXT: global_store_dword v[0:1], v0, off -; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_endpgm +; DEFAULTSIZE-V5-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: +; DEFAULTSIZE-V5: ; %bb.0: ; %entry +; DEFAULTSIZE-V5-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8 +; DEFAULTSIZE-V5-NEXT: s_add_u32 s0, s0, s15 +; DEFAULTSIZE-V5-NEXT: s_addc_u32 s1, s1, 0 +; DEFAULTSIZE-V5-NEXT: s_mov_b32 s33, 0 +; DEFAULTSIZE-V5-NEXT: s_movk_i32 s32, 0x1000 +; DEFAULTSIZE-V5-NEXT: s_waitcnt lgkmcnt(0) +; DEFAULTSIZE-V5-NEXT: s_cmp_lg_u32 s4, 0 +; DEFAULTSIZE-V5-NEXT: s_cbranch_scc1 .LBB1_2 +; DEFAULTSIZE-V5-NEXT: ; %bb.1: ; %bb.0 +; DEFAULTSIZE-V5-NEXT: s_add_i32 s4, s32, 0x1000 +; DEFAULTSIZE-V5-NEXT: s_and_b32 s4, s4, 0xfffff000 +; DEFAULTSIZE-V5-NEXT: s_lshl_b32 s5, s5, 2 +; DEFAULTSIZE-V5-NEXT: s_mov_b32 s32, s4 +; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v1, 0 +; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v2, s4 +; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v3, 1 +; DEFAULTSIZE-V5-NEXT: s_add_i32 s4, s4, 
s5 +; DEFAULTSIZE-V5-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; DEFAULTSIZE-V5-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v2, s4 +; DEFAULTSIZE-V5-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen +; DEFAULTSIZE-V5-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; DEFAULTSIZE-V5-NEXT: s_waitcnt vmcnt(0) +; DEFAULTSIZE-V5-NEXT: v_add_u32_e32 v0, v2, v0 +; DEFAULTSIZE-V5-NEXT: s_waitcnt lgkmcnt(0) +; DEFAULTSIZE-V5-NEXT: global_store_dword v1, v0, s[4:5] +; DEFAULTSIZE-V5-NEXT: .LBB1_2: ; %bb.1 +; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v0, 0 +; DEFAULTSIZE-V5-NEXT: global_store_dword v[0:1], v0, off +; DEFAULTSIZE-V5-NEXT: s_waitcnt vmcnt(0) +; DEFAULTSIZE-V5-NEXT: s_endpgm ; ; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; FLATSCR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: s_mov_b32 s32, 64 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; FLATSCR-NEXT: s_cmp_lg_u32 s2, 0 +; FLATSCR-NEXT: s_cmp_lg_u32 s0, 0 ; FLATSCR-NEXT: s_cbranch_scc1 .LBB1_2 ; FLATSCR-NEXT: ; %bb.1: ; %bb.0 -; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000 +; FLATSCR-NEXT: s_add_i32 s0, s32, 0x1000 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 -; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000 +; FLATSCR-NEXT: s_and_b32 s0, s0, 0xfffff000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 -; FLATSCR-NEXT: s_lshl_b32 s3, s3, 2 -; FLATSCR-NEXT: s_mov_b32 s32, s2 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2 -; FLATSCR-NEXT: s_add_i32 s2, s2, s3 -; FLATSCR-NEXT: scratch_load_dword v2, off, s2 -; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; FLATSCR-NEXT: s_lshl_b32 s1, s1, 2 +; FLATSCR-NEXT: s_mov_b32 s32, s0 +; 
FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0 +; FLATSCR-NEXT: s_add_i32 s0, s0, s1 +; FLATSCR-NEXT: scratch_load_dword v2, off, s0 +; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_u32_e32 v0, v2, v0 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) @@ -406,3 +406,6 @@ attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amd !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 CODE_OBJECT_VERSION} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; ASSUME1024: {{.*}} +; DEFAULTSIZE: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll index 9ab3eccd986a5..5c09d2bd61a39 100644 --- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll @@ -2104,7 +2104,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) { ; GFX9-LABEL: flat_inst_salu_offset_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -2115,7 +2115,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 1 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -2128,7 +2128,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) { ; ; GFX11-LABEL: flat_inst_salu_offset_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; 
GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:1 glc dlc @@ -2138,7 +2138,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:1 scope:SCOPE_SYS @@ -2154,7 +2154,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) { ; GFX9-LABEL: flat_inst_salu_offset_11bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -2165,7 +2165,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -2178,7 +2178,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) { ; ; GFX11-LABEL: flat_inst_salu_offset_11bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:2047 glc dlc @@ -2188,7 +2188,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: 
flat_load_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS @@ -2204,7 +2204,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) { ; GFX9-LABEL: flat_inst_salu_offset_12bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -2215,7 +2215,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_12bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfff ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -2228,7 +2228,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) { ; ; GFX11-LABEL: flat_inst_salu_offset_12bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc @@ -2238,7 +2238,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS @@ -2254,7 +2254,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_13bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2267,7 +2267,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_13bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -2280,7 +2280,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_13bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2292,7 +2292,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS @@ -2302,7 +2302,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_13bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 @@ -2315,7 +2315,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_13bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0 @@ -2334,7 +2334,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_neg_11bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2347,7 +2347,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_neg_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff800 ; GFX10-NEXT: s_addc_u32 s1, s1, -1 @@ -2360,7 +2360,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_11bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff800, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2372,7 +2372,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_neg_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-2048 scope:SCOPE_SYS @@ -2382,7 +2382,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_11bit_max: ; 
GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff800 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2395,7 +2395,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_neg_11bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff800 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2414,7 +2414,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_neg_12bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2427,7 +2427,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_neg_12bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX10-NEXT: s_addc_u32 s1, s1, -1 @@ -2440,7 +2440,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_12bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2452,7 +2452,7 @@ define 
amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_neg_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS @@ -2462,7 +2462,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_12bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2475,7 +2475,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_neg_12bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2494,7 +2494,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_neg_13bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2507,7 +2507,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_neg_13bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX10-NEXT: s_addc_u32 s1, s1, -1 @@ -2520,7 +2520,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_13bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2532,7 +2532,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_neg_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS @@ -2542,7 +2542,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_13bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2555,7 +2555,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_neg_13bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2574,7 +2574,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) { ; GFX9-LABEL: flat_inst_salu_offset_2x_11bit_max: ; GFX9: ; 
%bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -2585,7 +2585,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_2x_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfff ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -2598,7 +2598,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) { ; ; GFX11-LABEL: flat_inst_salu_offset_2x_11bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc @@ -2608,7 +2608,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_2x_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS @@ -2624,7 +2624,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_12bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2637,7 +2637,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { ; ; GFX10-LABEL: 
flat_inst_salu_offset_2x_12bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -2650,7 +2650,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_12bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2662,7 +2662,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_2x_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS @@ -2672,7 +2672,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_12bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 @@ -2685,7 +2685,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_12bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0 @@ -2704,7 +2704,7 @@ define amdgpu_kernel void 
@flat_inst_salu_offset_2x_12bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_13bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2717,7 +2717,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_2x_13bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x3fff ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -2730,7 +2730,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_13bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x3000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2742,7 +2742,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_2x_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:16383 scope:SCOPE_SYS @@ -2752,7 +2752,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_13bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, 
s0, 0x3fff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 @@ -2765,7 +2765,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_13bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x3fff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0 @@ -2784,7 +2784,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2797,7 +2797,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX10-NEXT: s_addc_u32 s1, s1, -1 @@ -2810,7 +2810,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2822,7 +2822,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS @@ -2832,7 +2832,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2845,7 +2845,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2864,7 +2864,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2877,7 +2877,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX10-NEXT: s_addc_u32 s1, s1, -1 @@ -2890,7 +2890,7 @@ define amdgpu_kernel void 
@flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2902,7 +2902,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS @@ -2912,7 +2912,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2925,7 +2925,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2944,7 +2944,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-SDAG-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2957,7 +2957,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xffffc000 ; GFX10-NEXT: s_addc_u32 s1, s1, -1 @@ -2970,7 +2970,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2982,7 +2982,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-16384 scope:SCOPE_SYS @@ -2992,7 +2992,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -3005,7 +3005,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: ; 
GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -3025,7 +3025,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 @@ -3037,7 +3037,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX10-NEXT: s_addc_u32 s1, s1, 2 @@ -3050,7 +3050,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3062,7 +3062,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ 
-3074,7 +3074,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3087,7 +3087,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3100,7 +3100,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3120,7 +3120,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 @@ -3132,7 +3132,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; 
GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x800 ; GFX10-NEXT: s_addc_u32 s1, s1, 2 @@ -3145,7 +3145,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3157,7 +3157,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3169,7 +3169,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x800 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3182,7 +3182,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x800 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3195,7 +3195,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; ; GFX12-GISEL-LABEL: 
flat_inst_salu_offset_64bit_11bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3215,7 +3215,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 @@ -3227,7 +3227,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfff ; GFX10-NEXT: s_addc_u32 s1, s1, 2 @@ -3240,7 +3240,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3252,7 +3252,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; 
GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3264,7 +3264,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3277,7 +3277,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3290,7 +3290,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3310,7 +3310,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -3323,7 +3323,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX10-NEXT: s_addc_u32 s1, s1, 2 @@ -3336,7 +3336,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3348,7 +3348,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3360,7 +3360,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3373,7 +3373,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3386,7 +3386,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; 
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3406,7 +3406,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -3419,7 +3419,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX10-NEXT: s_addc_u32 s1, s1, 2 @@ -3432,7 +3432,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3444,7 +3444,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 
0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3456,7 +3456,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3469,7 +3469,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3482,7 +3482,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3502,7 +3502,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -3515,7 +3515,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX10: ; %bb.0: -; 
GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX10-NEXT: s_addc_u32 s1, s1, 2 @@ -3528,7 +3528,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3540,7 +3540,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3552,7 +3552,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3565,7 +3565,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3578,7 +3578,7 @@ define amdgpu_kernel void 
@flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3598,7 +3598,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -3612,7 +3612,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3625,7 +3625,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, s0 @@ -3638,7 +3638,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; 
GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 @@ -3651,7 +3651,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3664,7 +3664,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3677,7 +3677,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 @@ -3697,7 +3697,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -3711,7 +3711,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x800 ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3724,7 +3724,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, s0 @@ -3737,7 +3737,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 @@ -3750,7 +3750,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x800 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3763,7 +3763,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 
s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x800 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3776,7 +3776,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 @@ -3796,7 +3796,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -3810,7 +3810,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfff ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3823,7 +3823,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 
0xfff, s0 @@ -3836,7 +3836,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 @@ -3849,7 +3849,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3862,7 +3862,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3875,7 +3875,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 @@ -3895,7 +3895,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr %p) { ; GFX9-SDAG-LABEL: 
flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -3909,7 +3909,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3922,7 +3922,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, s0 @@ -3935,7 +3935,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 @@ -3948,7 +3948,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX9-GISEL-NEXT: 
s_addc_u32 s1, s1, 0x80000000 @@ -3961,7 +3961,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3974,7 +3974,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 @@ -3994,7 +3994,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -4008,7 +4008,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -4021,7 +4021,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; ; GFX11-SDAG-LABEL: 
flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, s0 @@ -4034,7 +4034,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 @@ -4047,7 +4047,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -4060,7 +4060,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -4073,7 +4073,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: 
s_add_co_u32 s0, s0, 0x1fff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 @@ -4093,7 +4093,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -4107,7 +4107,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -4120,7 +4120,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, s0 @@ -4133,7 +4133,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 @@ -4146,7 +4146,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; ; 
GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -4159,7 +4159,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -4172,7 +4172,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll index 10381bc21ecc9..b5b8213bcd57e 100644 --- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll @@ -2176,7 +2176,7 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1) define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:1 glc @@ -2186,7 +2186,7 @@ define amdgpu_kernel void 
@global_inst_salu_offset_1(ptr addrspace(1) %p) { ; ; GFX10-LABEL: global_inst_salu_offset_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:1 glc dlc @@ -2196,7 +2196,7 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) { ; ; GFX11-LABEL: global_inst_salu_offset_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:1 glc dlc @@ -2208,7 +2208,7 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) { ; ; GFX12-LABEL: global_inst_salu_offset_1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:1 scope:SCOPE_SYS @@ -2226,7 +2226,7 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) { define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_11bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc @@ -2236,7 +2236,7 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p ; ; GFX10-LABEL: global_inst_salu_offset_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 
glc dlc @@ -2246,7 +2246,7 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p ; ; GFX11-LABEL: global_inst_salu_offset_11bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:2047 glc dlc @@ -2258,7 +2258,7 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p ; ; GFX12-LABEL: global_inst_salu_offset_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:2047 scope:SCOPE_SYS @@ -2276,7 +2276,7 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_12bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc @@ -2286,7 +2286,7 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p ; ; GFX10-LABEL: global_inst_salu_offset_12bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc @@ -2296,7 +2296,7 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p ; ; GFX11-LABEL: global_inst_salu_offset_12bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc @@ -2308,7 +2308,7 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p ; ; GFX12-LABEL: global_inst_salu_offset_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS @@ -2326,7 +2326,7 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_13bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x1000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc @@ -2336,7 +2336,7 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p ; ; GFX10-LABEL: global_inst_salu_offset_13bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x1800 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc @@ -2346,7 +2346,7 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p ; ; GFX11-LABEL: global_inst_salu_offset_13bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc @@ -2358,7 +2358,7 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p ; ; GFX12-LABEL: global_inst_salu_offset_13bit_max: ; GFX12: ; 
%bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS @@ -2376,7 +2376,7 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_neg_11bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-2048 glc @@ -2386,7 +2386,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1 ; ; GFX10-LABEL: global_inst_salu_offset_neg_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-2048 glc dlc @@ -2396,7 +2396,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1 ; ; GFX11-LABEL: global_inst_salu_offset_neg_11bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-2048 glc dlc @@ -2408,7 +2408,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1 ; ; GFX12-LABEL: global_inst_salu_offset_neg_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-2048 scope:SCOPE_SYS @@ -2426,7 +2426,7 @@ define 
amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1 define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_neg_12bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-4096 glc @@ -2436,7 +2436,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1 ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_neg_12bit_max: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2449,7 +2449,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1 ; ; GFX11-LABEL: global_inst_salu_offset_neg_12bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc @@ -2461,7 +2461,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1 ; ; GFX12-LABEL: global_inst_salu_offset_neg_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS @@ -2473,7 +2473,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1 ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_neg_12bit_max: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 @@ -2490,7 +2490,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1 define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_neg_13bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0xffffe000 @@ -2502,7 +2502,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1 ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_neg_13bit_max: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2515,7 +2515,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1 ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_neg_13bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2530,7 +2530,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1 ; ; GFX12-LABEL: global_inst_salu_offset_neg_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS @@ -2542,7 +2542,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr 
addrspace(1 ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_neg_13bit_max: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 @@ -2553,7 +2553,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1 ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_neg_13bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2573,7 +2573,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1 define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_2x_11bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc @@ -2583,7 +2583,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1) ; ; GFX10-LABEL: global_inst_salu_offset_2x_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc @@ -2593,7 +2593,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1) ; ; GFX11-LABEL: global_inst_salu_offset_2x_11bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc @@ -2605,7 +2605,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1) ; ; GFX12-LABEL: global_inst_salu_offset_2x_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS @@ -2623,7 +2623,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1) define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_2x_12bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x1000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc @@ -2633,7 +2633,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1) ; ; GFX10-LABEL: global_inst_salu_offset_2x_12bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x1800 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc @@ -2643,7 +2643,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1) ; ; GFX11-LABEL: global_inst_salu_offset_2x_12bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc @@ -2655,7 +2655,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1) ; ; GFX12-LABEL: 
global_inst_salu_offset_2x_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS @@ -2673,7 +2673,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1) define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_2x_13bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc @@ -2683,7 +2683,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1) ; ; GFX10-LABEL: global_inst_salu_offset_2x_13bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x3800 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc @@ -2693,7 +2693,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1) ; ; GFX11-LABEL: global_inst_salu_offset_2x_13bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x3000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc @@ -2705,7 +2705,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1) ; ; GFX12-LABEL: global_inst_salu_offset_2x_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:16383 
scope:SCOPE_SYS @@ -2723,7 +2723,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1) define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_2x_neg_11bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-4096 glc @@ -2733,7 +2733,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_2x_neg_11bit_max: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2746,7 +2746,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac ; ; GFX11-LABEL: global_inst_salu_offset_2x_neg_11bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc @@ -2758,7 +2758,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac ; ; GFX12-LABEL: global_inst_salu_offset_2x_neg_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS @@ -2770,7 +2770,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_11bit_max: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 @@ -2787,7 +2787,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_2x_neg_12bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0xffffe000 @@ -2799,7 +2799,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_2x_neg_12bit_max: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2812,7 +2812,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_2x_neg_12bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2827,7 +2827,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac ; ; GFX12-LABEL: global_inst_salu_offset_2x_neg_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS @@ -2839,7 
+2839,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_12bit_max: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 @@ -2850,7 +2850,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_2x_neg_12bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2870,7 +2870,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_2x_neg_13bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0xffffc000 @@ -2882,7 +2882,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_2x_neg_13bit_max: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2895,7 +2895,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_2x_neg_13bit_max: ; GFX11-GISEL: ; %bb.0: -; 
GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2910,7 +2910,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac ; ; GFX12-LABEL: global_inst_salu_offset_2x_neg_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-16384 scope:SCOPE_SYS @@ -2922,7 +2922,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_13bit_max: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 @@ -2933,7 +2933,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_2x_neg_13bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2954,7 +2954,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: 
s_add_u32 s0, s0, 0x7ff @@ -2966,7 +2966,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -2979,7 +2979,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -2994,7 +2994,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3009,7 +3009,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 @@ -3020,7 +3020,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 
0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3034,7 +3034,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3055,7 +3055,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x800 @@ -3067,7 +3067,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x800 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3080,7 +3080,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x800 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3095,7 +3095,7 @@ define 
amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3110,7 +3110,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x800, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 @@ -3121,7 +3121,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3135,7 +3135,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3156,7 +3156,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0xfff @@ -3168,7 +3168,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xfff ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3181,7 +3181,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3196,7 +3196,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3211,7 +3211,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x800, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 @@ -3222,7 +3222,7 @@ define amdgpu_kernel void 
@global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3236,7 +3236,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3257,7 +3257,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x1000 @@ -3269,7 +3269,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3282,7 +3282,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3297,7 +3297,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3312,7 +3312,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 @@ -3323,7 +3323,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3337,7 +3337,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3358,7 +3358,7 @@ define amdgpu_kernel void 
@global_inst_salu_offset_64bit_12bit_split1(ptr addrsp define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x1fff @@ -3370,7 +3370,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3383,7 +3383,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3398,7 +3398,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3413,7 +3413,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-SDAG-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1800, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 @@ -3424,7 +3424,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3438,7 +3438,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3459,7 +3459,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x2000 @@ -3471,7 +3471,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3484,7 +3484,7 @@ define 
amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3499,7 +3499,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3514,7 +3514,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 @@ -3525,7 +3525,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3539,7 +3539,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 
s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3560,7 +3560,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x7ff @@ -3572,7 +3572,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p ; ; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff @@ -3584,7 +3584,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p ; ; GFX11-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s0, 0x7ff @@ -3598,7 +3598,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff @@ -3612,7 +3612,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p ; ; GFX12-SDAG-LABEL: 
global_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x7ff ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 @@ -3634,7 +3634,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x800 @@ -3646,7 +3646,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p ; ; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x800 @@ -3658,7 +3658,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p ; ; GFX11-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s0, 0x800 @@ -3672,7 +3672,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 
s0, s0, 0x800 @@ -3686,7 +3686,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x800 ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 @@ -3708,7 +3708,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0xfff @@ -3720,7 +3720,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p ; ; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfff @@ -3732,7 +3732,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p ; ; GFX11-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s0, 0xfff @@ -3746,7 +3746,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff @@ -3760,7 +3760,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0xfff ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 @@ -3782,7 +3782,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x1000 @@ -3794,7 +3794,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p ; ; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1000 @@ -3806,7 +3806,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p ; ; GFX11-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s0, 0x1000 @@ -3820,7 +3820,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p ; ; GFX12-GISEL-LABEL: 
global_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000 @@ -3834,7 +3834,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1000 ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 @@ -3856,7 +3856,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x1fff @@ -3868,7 +3868,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p ; ; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff @@ -3880,7 +3880,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p ; ; GFX11-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: 
s_add_u32 s0, s0, 0x1fff @@ -3894,7 +3894,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff @@ -3908,7 +3908,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1fff ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 @@ -3930,7 +3930,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x2000 @@ -3942,7 +3942,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p ; ; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x2000 @@ -3954,7 +3954,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p ; ; GFX11-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s0, 0x2000 @@ -3968,7 +3968,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000 @@ -3982,7 +3982,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x2000 ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 diff --git a/llvm/test/CodeGen/AMDGPU/omod.ll b/llvm/test/CodeGen/AMDGPU/omod.ll index 769d035858ca8..df15f98ae27ff 100644 --- a/llvm/test/CodeGen/AMDGPU/omod.ll +++ b/llvm/test/CodeGen/AMDGPU/omod.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 { ; SI-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -25,7 +25,7 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspac ; ; VI-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -43,13 
+43,14 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspac ; ; GFX11-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -58,13 +59,14 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspac ; ; GFX12-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -84,7 +86,7 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspac define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 { ; SI-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -101,7 +103,7 @@ 
define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspac ; ; VI-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -119,13 +121,14 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspac ; ; GFX11-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 @@ -134,13 +137,14 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspac ; ; GFX12-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[0:1], 1.0, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mul_f64_e32 v[0:1], 0.5, v[0:1] ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 @@ -160,7 +164,7 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr 
addrspac define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_omod_div2_f32_enable_ieee_nsz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -177,7 +181,7 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out ; ; VI-LABEL: v_omod_div2_f32_enable_ieee_nsz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -195,13 +199,14 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out ; ; GFX11-LABEL: v_omod_div2_f32_enable_ieee_nsz: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -210,13 +215,14 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out ; ; GFX12-LABEL: v_omod_div2_f32_enable_ieee_nsz: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; 
GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -236,7 +242,7 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #5 { ; SI-LABEL: v_omod_div2_f64_enable_ieee_nsz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -253,7 +259,7 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out ; ; VI-LABEL: v_omod_div2_f64_enable_ieee_nsz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -271,13 +277,14 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out ; ; GFX11-LABEL: v_omod_div2_f64_enable_ieee_nsz: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 @@ -286,13 +293,14 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out ; ; GFX12-LABEL: v_omod_div2_f64_enable_ieee_nsz: ; GFX12: ; %bb.0: -; 
GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[0:1], 1.0, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mul_f64_e32 v[0:1], 0.5, v[0:1] ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.ll b/llvm/test/CodeGen/AMDGPU/optimize-compare.ll index bd7f9014d55ca..d73b1bd29c981 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-compare.ll +++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.ll @@ -4,14 +4,14 @@ define amdgpu_kernel void @if_masked_1(i32 %arg, ptr addrspace(1) %p) { ; GCN-LABEL: if_masked_1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp0_b32 s4, 0 -; GCN-NEXT: s_cselect_b32 s0, 22, 33 -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: global_store_dword v0, v1, s[2:3] +; GCN-NEXT: s_cselect_b32 s2, 22, 33 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: global_store_dword v0, v1, s[0:1] ; GCN-NEXT: s_endpgm %and = and i32 %arg, 1 %cmp = icmp eq i32 %and, 0 @@ -23,14 +23,14 @@ define amdgpu_kernel void @if_masked_1(i32 %arg, ptr addrspace(1) %p) { define amdgpu_kernel void @if_masked_1024(i32 %arg, ptr addrspace(1) %p) { ; GCN-LABEL: if_masked_1024: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; 
GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp0_b32 s4, 10 -; GCN-NEXT: s_cselect_b32 s0, 22, 33 -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: global_store_dword v0, v1, s[2:3] +; GCN-NEXT: s_cselect_b32 s2, 22, 33 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: global_store_dword v0, v1, s[0:1] ; GCN-NEXT: s_endpgm %and = and i32 %arg, 1024 %cmp = icmp eq i32 %and, 0 @@ -42,14 +42,14 @@ define amdgpu_kernel void @if_masked_1024(i32 %arg, ptr addrspace(1) %p) { define amdgpu_kernel void @if_masked_0x80000000(i32 %arg, ptr addrspace(1) %p) { ; GCN-LABEL: if_masked_0x80000000: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp0_b32 s4, 31 -; GCN-NEXT: s_cselect_b32 s0, 22, 33 -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: global_store_dword v0, v1, s[2:3] +; GCN-NEXT: s_cselect_b32 s2, 22, 33 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: global_store_dword v0, v1, s[0:1] ; GCN-NEXT: s_endpgm %and = and i32 %arg, 2147483648 %cmp = icmp eq i32 %and, 0 @@ -62,7 +62,7 @@ define amdgpu_kernel void @if_masked_0x80000000(i32 %arg, ptr addrspace(1) %p) define amdgpu_kernel void @if_masked_0x8000000000000000(i64 %arg, ptr addrspace(1) %p) { ; GCN-LABEL: if_masked_0x8000000000000000: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll index a50a0766f67c2..4ee2b8e981f44 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll +++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) { ; 
GCN-LABEL: negated_cond: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: s_mov_b32 s6, 0 @@ -92,7 +92,7 @@ bb4: define amdgpu_kernel void @negated_cond_dominated_blocks(ptr addrspace(1) %arg1) { ; GCN-LABEL: negated_cond_dominated_blocks: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 ; GCN-NEXT: s_mov_b32 s6, 0 diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll index 0473f803bfb30..eff80236d9866 100644 --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: or_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX8-LABEL: or_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -70,7 +70,7 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: or_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -92,7 +92,7 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX8-LABEL: 
or_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -143,7 +143,7 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX6-LABEL: scalar_or_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -156,7 +156,7 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; GFX8-LABEL: scalar_or_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -185,40 +185,40 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, i32 %b) { ; GFX6-LABEL: vector_or_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s12, s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: s_mov_b32 s10, s2 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s12, s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_mov_b32 s2, s10 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s11, s3 -; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_mov_b32 s0, s6 +; GFX6-NEXT: s_mov_b32 s1, s7 +; GFX6-NEXT: s_mov_b32 s3, s11 +; GFX6-NEXT: buffer_load_dword v0, off, 
s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s8, s4 +; GFX6-NEXT: s_mov_b32 s9, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_or_b32_e32 v0, s12, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: vector_or_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s12, s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_mov_b32 s10, s2 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s12, s[2:3], 0x34 +; GFX8-NEXT: s_mov_b32 s11, 0xf000 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s2, s10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s11, s3 -; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX8-NEXT: s_mov_b32 s0, s4 -; GFX8-NEXT: s_mov_b32 s1, s5 +; GFX8-NEXT: s_mov_b32 s0, s6 +; GFX8-NEXT: s_mov_b32 s1, s7 +; GFX8-NEXT: s_mov_b32 s3, s11 +; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_mov_b32 s8, s4 +; GFX8-NEXT: s_mov_b32 s9, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v0, s12, v0 -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: vector_or_i32: @@ -246,8 +246,8 @@ define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) { ; GFX6-LABEL: scalar_or_literal_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -258,8 +258,8 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) 
; ; GFX8-LABEL: scalar_or_literal_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -286,8 +286,8 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; GFX6-LABEL: scalar_or_literal_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -300,8 +300,8 @@ define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32 ; ; GFX8-LABEL: scalar_or_literal_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -332,43 +332,43 @@ define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32 define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { ; GFX6-LABEL: scalar_or_literal_multi_use_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x1d +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x1d ; GFX6-NEXT: s_movk_i32 s8, 0x3039 ; GFX6-NEXT: s_mov_b32 s9, 0xf237b -; 
GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: s_add_u32 s0, s0, 0x3039 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX6-NEXT: s_addc_u32 s1, s1, 0xf237b +; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_add_u32 s0, s6, 0x3039 +; GFX6-NEXT: s_addc_u32 s1, s7, 0xf237b ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: scalar_or_literal_multi_use_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x74 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x74 ; GFX8-NEXT: s_movk_i32 s8, 0x3039 ; GFX8-NEXT: s_mov_b32 s9, 0xf237b ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_add_u32 s0, s0, 0x3039 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_add_u32 s0, s2, 0x3039 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX8-NEXT: s_addc_u32 s1, s1, 0xf237b +; GFX8-NEXT: s_addc_u32 s1, s3, 0xf237b ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; 
GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -408,8 +408,8 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; GFX6-LABEL: scalar_or_inline_imm_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -421,8 +421,8 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x ; ; GFX8-LABEL: scalar_or_inline_imm_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -451,44 +451,44 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX6-LABEL: scalar_or_inline_imm_multi_use_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_or_b32 s4, s6, 63 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_or_b32 s2, s6, 63 +; GFX6-NEXT: s_mov_b32 s8, s4 +; GFX6-NEXT: s_mov_b32 s9, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; 
GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX6-NEXT: s_add_u32 s0, s8, 63 -; GFX6-NEXT: s_addc_u32 s1, s9, 0 +; GFX6-NEXT: s_add_u32 s0, s0, 63 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: scalar_or_inline_imm_multi_use_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_mov_b32 s11, 0xf000 +; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s0, s4 -; GFX8-NEXT: s_or_b32 s4, s6, 63 -; GFX8-NEXT: s_mov_b32 s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_or_b32 s2, s6, 63 +; GFX8-NEXT: s_mov_b32 s8, s4 +; GFX8-NEXT: s_mov_b32 s9, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX8-NEXT: s_add_u32 s0, s8, 63 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_add_u32 s0, s0, 63 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm ; @@ -521,8 +521,8 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) % define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; GFX6-LABEL: 
scalar_or_neg_inline_imm_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x13 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x13 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v1, -1 @@ -534,8 +534,8 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [ ; ; GFX8-LABEL: scalar_or_neg_inline_imm_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_mov_b32_e32 v1, -1 @@ -565,7 +565,7 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_literal_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -583,7 +583,7 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: vector_or_literal_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -624,7 +624,7 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_inline_immediate_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: 
s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -642,7 +642,7 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, ; ; GFX8-LABEL: vector_or_inline_immediate_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -683,8 +683,8 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX6-LABEL: scalar_or_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -698,8 +698,8 @@ define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) ; ; GFX8-LABEL: scalar_or_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -730,48 +730,48 @@ define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: s_mov_b32 s10, s2 -; GFX6-NEXT: s_mov_b32 s11, s3 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; 
GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_mov_b32 s2, s10 +; GFX6-NEXT: s_mov_b32 s3, s11 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s12, s6 ; GFX6-NEXT: s_mov_b32 s13, s7 -; GFX6-NEXT: s_mov_b32 s14, s2 -; GFX6-NEXT: s_mov_b32 s15, s3 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX6-NEXT: s_mov_b32 s14, s10 +; GFX6-NEXT: s_mov_b32 s15, s11 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_mov_b32 s8, s4 +; GFX6-NEXT: s_mov_b32 s9, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: vector_or_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_mov_b32 s10, s2 -; GFX8-NEXT: s_mov_b32 s11, s3 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_mov_b32 s11, 0xf000 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s2, s10 +; GFX8-NEXT: s_mov_b32 s3, s11 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s12, s6 ; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: s_mov_b32 s14, s2 -; GFX8-NEXT: s_mov_b32 s15, s3 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX8-NEXT: s_mov_b32 s14, s10 +; GFX8-NEXT: s_mov_b32 s15, s11 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX8-NEXT: s_mov_b32 s0, s4 -; GFX8-NEXT: s_mov_b32 s1, s5 +; GFX8-NEXT: s_mov_b32 s8, s4 +; GFX8-NEXT: s_mov_b32 s9, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 
; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: vector_or_i64: @@ -803,42 +803,42 @@ define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, i64 %b) { ; GFX6-LABEL: scalar_vector_or_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: s_mov_b32 s10, s2 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_mov_b32 s2, s10 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s11, s3 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_mov_b32 s0, s6 +; GFX6-NEXT: s_mov_b32 s1, s7 +; GFX6-NEXT: s_mov_b32 s3, s11 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s8, s4 +; GFX6-NEXT: s_mov_b32 s9, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_or_b32_e32 v0, s12, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s13, v1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: scalar_vector_or_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_mov_b32 s10, s2 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x34 +; GFX8-NEXT: s_mov_b32 s11, 0xf000 +; GFX8-NEXT: s_mov_b32 s10, -1 +; 
GFX8-NEXT: s_mov_b32 s2, s10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s11, s3 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX8-NEXT: s_mov_b32 s0, s4 -; GFX8-NEXT: s_mov_b32 s1, s5 +; GFX8-NEXT: s_mov_b32 s0, s6 +; GFX8-NEXT: s_mov_b32 s1, s7 +; GFX8-NEXT: s_mov_b32 s3, s11 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8-NEXT: s_mov_b32 s8, s4 +; GFX8-NEXT: s_mov_b32 s9, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v0, s12, v0 ; GFX8-NEXT: v_or_b32_e32 v1, s13, v1 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: scalar_vector_or_i64: @@ -867,7 +867,7 @@ define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64_loadimm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -886,7 +886,7 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: vector_or_i64_loadimm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -931,7 +931,7 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64_imm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: 
s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -949,7 +949,7 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac ; ; GFX8-LABEL: vector_or_i64_imm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -990,7 +990,7 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64_neg_inline_imm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1009,7 +1009,7 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p ; ; GFX8-LABEL: vector_or_i64_neg_inline_imm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -1053,7 +1053,7 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64_neg_literal: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1072,7 +1072,7 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: vector_or_i64_neg_literal: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 
0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -1116,9 +1116,9 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { ; GFX6-LABEL: trunc_i64_or_to_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x13 -; GFX6-NEXT: s_load_dword s5, s[0:1], 0x1d -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x13 +; GFX6-NEXT: s_load_dword s5, s[2:3], 0x1d +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1129,9 +1129,9 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], ; ; GFX8-LABEL: trunc_i64_or_to_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x74 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x4c +; GFX8-NEXT: s_load_dword s5, s[2:3], 0x74 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1159,21 +1159,21 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; GFX6-LABEL: or_i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: s_mov_b32 s10, s2 -; GFX6-NEXT: s_mov_b32 s11, s3 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_mov_b32 s2, s10 +; GFX6-NEXT: s_mov_b32 s3, s11 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 
s12, s6 ; GFX6-NEXT: s_mov_b32 s13, s7 -; GFX6-NEXT: s_mov_b32 s14, s2 -; GFX6-NEXT: s_mov_b32 s15, s3 -; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX6-NEXT: s_mov_b32 s14, s10 +; GFX6-NEXT: s_mov_b32 s15, s11 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GFX6-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_mov_b32 s8, s4 +; GFX6-NEXT: s_mov_b32 s9, s5 ; GFX6-NEXT: s_waitcnt vmcnt(1) ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1181,26 +1181,26 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p ; GFX6-NEXT: v_max_f32_e32 v0, v1, v0 ; GFX6-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: or_i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_mov_b32 s10, s2 -; GFX8-NEXT: s_mov_b32 s11, s3 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_mov_b32 s11, 0xf000 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s2, s10 +; GFX8-NEXT: s_mov_b32 s3, s11 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s12, s6 ; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: s_mov_b32 s14, s2 -; GFX8-NEXT: s_mov_b32 s15, s3 -; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX8-NEXT: s_mov_b32 s14, s10 +; GFX8-NEXT: s_mov_b32 s15, s11 +; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GFX8-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; GFX8-NEXT: s_mov_b32 s0, s4 -; GFX8-NEXT: s_mov_b32 s1, s5 +; GFX8-NEXT: s_mov_b32 s8, s4 +; GFX8-NEXT: s_mov_b32 s9, s5 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX8-NEXT: 
s_waitcnt vmcnt(0) @@ -1208,7 +1208,7 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p ; GFX8-NEXT: v_max_f32_e32 v0, v1, v0 ; GFX8-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: or_i1: @@ -1244,8 +1244,8 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) { ; GFX6-LABEL: s_or_i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1260,8 +1260,8 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c ; ; GFX8-LABEL: s_or_i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll index e21b93a386c3e..5792fab7011af 100644 --- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @s_pack_v2f16(ptr addrspace(4) %in0, ptr addrspace(4) %in1) #0 { ; GFX9-LABEL: s_pack_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -20,7 +20,7 @@ define amdgpu_kernel 
void @s_pack_v2f16(ptr addrspace(4) %in0, ptr addrspace(4) ; ; GFX8-LABEL: s_pack_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -35,7 +35,7 @@ define amdgpu_kernel void @s_pack_v2f16(ptr addrspace(4) %in0, ptr addrspace(4) ; ; GFX7-LABEL: s_pack_v2f16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -64,7 +64,7 @@ define amdgpu_kernel void @s_pack_v2f16(ptr addrspace(4) %in0, ptr addrspace(4) define amdgpu_kernel void @s_pack_v2f16_imm_lo(ptr addrspace(4) %in1) #0 { ; GFX9-LABEL: s_pack_v2f16_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -76,7 +76,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_lo(ptr addrspace(4) %in1) #0 { ; ; GFX8-LABEL: s_pack_v2f16_imm_lo: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -89,7 +89,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_lo(ptr addrspace(4) %in1) #0 { ; ; GFX7-LABEL: s_pack_v2f16_imm_lo: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -113,7 +113,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_lo(ptr addrspace(4) %in1) #0 { define amdgpu_kernel void @s_pack_v2f16_imm_hi(ptr addrspace(4) %in0) #0 { ; 
GFX9-LABEL: s_pack_v2f16_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -125,7 +125,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_hi(ptr addrspace(4) %in0) #0 { ; ; GFX8-LABEL: s_pack_v2f16_imm_hi: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -138,7 +138,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_hi(ptr addrspace(4) %in0) #0 { ; ; GFX7-LABEL: s_pack_v2f16_imm_hi: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -162,7 +162,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_hi(ptr addrspace(4) %in0) #0 { define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -178,7 +178,7 @@ define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) ; ; GFX8-LABEL: v_pack_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -200,7 +200,7 @@ define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) ; ; GFX7-LABEL: v_pack_v2f16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: 
s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-NEXT: s_mov_b32 s7, 0x100f000 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -240,7 +240,7 @@ define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2f16_user: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -258,7 +258,7 @@ define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspac ; ; GFX8-LABEL: v_pack_v2f16_user: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -282,7 +282,7 @@ define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspac ; ; GFX7-LABEL: v_pack_v2f16_user: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0x100f000 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -324,7 +324,7 @@ define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspac define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2f16_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -339,7 +339,7 @@ define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 { ; ; GFX8-LABEL: v_pack_v2f16_imm_lo: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx2 
s[0:1], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -356,7 +356,7 @@ define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 { ; ; GFX7-LABEL: v_pack_v2f16_imm_lo: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -386,7 +386,7 @@ define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 { define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2f16_inline_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -401,7 +401,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0 ; ; GFX8-LABEL: v_pack_v2f16_inline_imm_lo: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -418,7 +418,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0 ; ; GFX7-LABEL: v_pack_v2f16_inline_imm_lo: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -448,7 +448,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0 define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 { ; GFX9-LABEL: v_pack_v2f16_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; 
GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -463,7 +463,7 @@ define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 { ; ; GFX8-LABEL: v_pack_v2f16_imm_hi: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -480,7 +480,7 @@ define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 { ; ; GFX7-LABEL: v_pack_v2f16_imm_hi: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -510,7 +510,7 @@ define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 { define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0) #0 { ; GFX9-LABEL: v_pack_v2f16_inline_f16imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -525,7 +525,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0) ; ; GFX8-LABEL: v_pack_v2f16_inline_f16imm_hi: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -542,7 +542,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0) ; ; GFX7-LABEL: v_pack_v2f16_inline_f16imm_hi: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 @@ -572,7 +572,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0) define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0 { ; GFX9-LABEL: v_pack_v2f16_inline_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -586,7 +586,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0 ; ; GFX8-LABEL: v_pack_v2f16_inline_imm_hi: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -603,7 +603,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0 ; ; GFX7-LABEL: v_pack_v2f16_inline_imm_hi: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll index 4b21493bd7ca6..529e64715500d 100644 --- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @s_pack_v2i16(ptr addrspace(4) %in0, ptr addrspace(4) %in1) #0 { ; GFX9-LABEL: s_pack_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -20,7 +20,7 @@ define amdgpu_kernel void @s_pack_v2i16(ptr addrspace(4) %in0, ptr addrspace(4) ; ; GFX803-LABEL: s_pack_v2i16: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x0 +; GFX803-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX803-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -35,7 +35,7 @@ define amdgpu_kernel void @s_pack_v2i16(ptr addrspace(4) %in0, ptr addrspace(4) ; ; GFX7-LABEL: s_pack_v2i16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -62,7 +62,7 @@ define amdgpu_kernel void @s_pack_v2i16(ptr addrspace(4) %in0, ptr addrspace(4) define amdgpu_kernel void @s_pack_v2i16_imm_lo(ptr addrspace(4) %in1) #0 { ; GFX9-LABEL: s_pack_v2i16_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -74,7 +74,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_lo(ptr addrspace(4) %in1) #0 { ; ; GFX803-LABEL: s_pack_v2i16_imm_lo: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) @@ -87,7 +87,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_lo(ptr addrspace(4) %in1) #0 { ; ; GFX7-LABEL: s_pack_v2i16_imm_lo: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -110,7 +110,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_lo(ptr addrspace(4) %in1) #0 { define amdgpu_kernel void @s_pack_v2i16_imm_hi(ptr addrspace(4) %in0) #0 { ; GFX9-LABEL: s_pack_v2i16_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -122,7 +122,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_hi(ptr addrspace(4) %in0) #0 { ; ; GFX803-LABEL: s_pack_v2i16_imm_hi: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) @@ -135,7 +135,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_hi(ptr addrspace(4) %in0) #0 { ; ; GFX7-LABEL: s_pack_v2i16_imm_hi: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -158,7 +158,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_hi(ptr addrspace(4) %in0) #0 { define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -174,7 +174,7 @@ define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) ; ; GFX803-LABEL: v_pack_v2i16: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX803-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -196,7 +196,7 @@ define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) ; ; GFX7-LABEL: v_pack_v2i16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-NEXT: s_mov_b32 s7, 0x100f000 ; GFX7-NEXT: s_mov_b32 
s6, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -234,7 +234,7 @@ define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2i16_user: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -252,7 +252,7 @@ define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspac ; ; GFX803-LABEL: v_pack_v2i16_user: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX803-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -276,7 +276,7 @@ define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspac ; ; GFX7-LABEL: v_pack_v2i16_user: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0x100f000 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -316,7 +316,7 @@ define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspac define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2i16_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -331,7 +331,7 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 { ; ; GFX803-LABEL: v_pack_v2i16_imm_lo: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; 
GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -348,7 +348,7 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 { ; ; GFX7-LABEL: v_pack_v2i16_imm_lo: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -377,7 +377,7 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 { define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2i16_inline_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -391,7 +391,7 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0 ; ; GFX803-LABEL: v_pack_v2i16_inline_imm_lo: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -408,7 +408,7 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0 ; ; GFX7-LABEL: v_pack_v2i16_inline_imm_lo: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -437,7 +437,7 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0 define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 { ; GFX9-LABEL: v_pack_v2i16_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; 
GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -452,7 +452,7 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 { ; ; GFX803-LABEL: v_pack_v2i16_imm_hi: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -469,7 +469,7 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 { ; ; GFX7-LABEL: v_pack_v2i16_imm_hi: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -498,7 +498,7 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 { define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0 { ; GFX9-LABEL: v_pack_v2i16_inline_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -512,7 +512,7 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0 ; ; GFX803-LABEL: v_pack_v2i16_inline_imm_hi: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -529,7 +529,7 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0 ; ; GFX7-LABEL: v_pack_v2i16_inline_imm_hi: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 diff 
--git a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll index a3f7906a05f6b..c72a7ba3eee83 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll @@ -803,5 +803,5 @@ bb: declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #1 -attributes #0 = { nounwind } +attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll index 45fbaaabc65b5..58b61510c24e8 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -111,4 +111,4 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32) -attributes #0 = { nounwind "amdgpu-num-vgpr"="5" } +attributes #0 = { nounwind "amdgpu-num-vgpr"="5" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll b/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll index 8d180e7d33f84..560f0a0679810 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll @@ -97,7 +97,7 @@ define <2 x i16> @trunc_srl_v2i64_16_to_v2i16(<2 x i64> %x) { define amdgpu_kernel void 
@s_trunc_srl_i64_16_to_i16(i64 %x) { ; GCN-LABEL: s_trunc_srl_i64_16_to_i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: s_or_b32 s0, s0, 4 diff --git a/llvm/test/CodeGen/AMDGPU/permlane-op-sel.ll b/llvm/test/CodeGen/AMDGPU/permlane-op-sel.ll index 031a46271f2c0..8f450e5bcb83f 100644 --- a/llvm/test/CodeGen/AMDGPU/permlane-op-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/permlane-op-sel.ll @@ -4,10 +4,10 @@ declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) ; OBJ-LABEL: <permlane_op_sel>: -; OBJ: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] +; OBJ: v_permlane16_b32 v0, v0, s5, s6 op_sel:[1,0] ; ASM-LABEL: permlane_op_sel: -; ASM: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] ; encoding: [0x00,0x08,0x77,0xd7,0x00,0x0f,0x00,0x00] +; ASM: v_permlane16_b32 v0, v0, s5, s6 op_sel:[1,0] ; encoding: [0x00,0x08,0x77,0xd7,0x00,0x0b,0x18,0x00] define amdgpu_kernel void @permlane_op_sel(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 0) store i32 %v, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/permlane16_opsel.ll b/llvm/test/CodeGen/AMDGPU/permlane16_opsel.ll index caa7fb8df1990..4ae0547d11fff 100644 --- a/llvm/test/CodeGen/AMDGPU/permlane16_opsel.ll +++ b/llvm/test/CodeGen/AMDGPU/permlane16_opsel.ll @@ -8,6 +8,7 @@ declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.workitem.id.y() + define amdgpu_kernel void @v_permlane16_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; SDAG: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec @@ -45,8 +46,9 @@ define amdgpu_kernel void
@v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0 } define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) { - ; SDAG: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec - ; GISEL: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec +; SDAG-GFX10: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec +; SDAG-GFX11: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec +; GISEL: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -124,7 +126,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src } define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) { - ; SDAG: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG-GFX10: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG-GFX11: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) @@ -167,7 +170,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc(ptr addrspace(1) %out, i3 } define amdgpu_kernel void @v_permlane16_b32_tid_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG: V_PERMLANE16_B32_e64 
0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}(s32), 0, implicit $exec + ; SDAG-GFX10: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}(s32), 0, implicit $exec + ; SDAG-GFX11: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlane16(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) @@ -176,7 +180,8 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid(ptr addrspace(1) %out, i32 % } define amdgpu_kernel void @v_permlane16_b32_undef_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG-GFX10: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG-GFX11: V_PERMLANE16_B32_e64 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -186,7 +191,8 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid(ptr addrspace(1) %out, i32 } define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG-GFX10: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG-GFX11: V_PERMLANE16_B32_e64 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed 
{{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) @@ -195,7 +201,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %sr } define amdgpu_kernel void @v_permlane16_b32_i_tid_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG: V_PERMLANE16_B32_e64 4, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG-GFX10: V_PERMLANE16_B32_e64 4, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG-GFX11: V_PERMLANE16_B32_e64 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANE16_B32_e64 4, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -205,7 +212,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi(ptr addrspace(1) %out, i32 } define amdgpu_kernel void @v_permlane16_b32_i_tid_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}(s32), 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG-GFX10: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}(s32), 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG-GFX11: V_PERMLANE16_B32_e64 0, killed {{%[0-9]+}}, 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 4, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -215,7 +223,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc(ptr addrspace(1) %out, i32 } 
define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG: V_PERMLANE16_B32_e64 4, {{%[0-9]+}}(s32), 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG-GFX10: V_PERMLANE16_B32_e64 4, {{%[0-9]+}}(s32), 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG-GFX11: V_PERMLANE16_B32_e64 4, killed {{%[0-9]+}}, 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANE16_B32_e64 4, {{%[0-9]+}}, 4, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -225,7 +234,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i } define amdgpu_kernel void @v_permlanex16_b32_tid_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}(s32), 0, implicit $exec + ; SDAG-GFX10: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}(s32), 0, implicit $exec + ; SDAG-GFX11: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlanex16(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) @@ -234,7 +244,8 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid(ptr addrspace(1) %out, i32 } define amdgpu_kernel void @v_permlanex16_b32_undef_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG-GFX10: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed 
{{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG-GFX11: V_PERMLANEX16_B32_e64 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -244,7 +255,8 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid(ptr addrspace(1) %out, i3 } define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG-GFX10: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG-GFX11: V_PERMLANEX16_B32_e64 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) @@ -253,7 +265,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %s } define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG: V_PERMLANEX16_B32_e64 4, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG-GFX10: V_PERMLANEX16_B32_e64 4, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG-GFX11: V_PERMLANEX16_B32_e64 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANEX16_B32_e64 4, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = 
call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -263,7 +276,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi(ptr addrspace(1) %out, i32 } define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}(s32), 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG-GFX10: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}(s32), 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG-GFX11: V_PERMLANEX16_B32_e64 0, killed {{%[0-9]+}}, 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}, 4, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -273,7 +287,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc(ptr addrspace(1) %out, i32 } define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG: V_PERMLANEX16_B32_e64 4, {{%[0-9]+}}(s32), 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG-GFX10: V_PERMLANEX16_B32_e64 4, {{%[0-9]+}}(s32), 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG-GFX11: V_PERMLANEX16_B32_e64 4, killed {{%[0-9]+}}, 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANEX16_B32_e64 4, {{%[0-9]+}}, 4, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison diff --git a/llvm/test/CodeGen/AMDGPU/permute.ll b/llvm/test/CodeGen/AMDGPU/permute.ll index 6cab2b1839307..69ddc9a48dbc4 100644 --- a/llvm/test/CodeGen/AMDGPU/permute.ll +++ b/llvm/test/CodeGen/AMDGPU/permute.ll @@ -4,17 +4,17 @@ define amdgpu_kernel void @lsh8_or_and(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; 
GCN-LABEL: lsh8_or_and: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x6050400 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, v2, s0, v3 +; GCN-NEXT: v_perm_b32 v2, v2, s2, v3 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm bb: @@ -31,17 +31,17 @@ bb: define amdgpu_kernel void @lsr24_or_and(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: lsr24_or_and: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x7060503 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, s0, v2, v3 +; GCN-NEXT: v_perm_b32 v2, s2, v2, v3 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm bb: @@ -58,17 +58,17 @@ bb: define amdgpu_kernel void @and_or_lsr24(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: and_or_lsr24: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 
v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x7060503 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, v2, s0, v3 +; GCN-NEXT: v_perm_b32 v2, v2, s2, v3 ; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm @@ -87,17 +87,17 @@ bb: define amdgpu_kernel void @and_or_and(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: and_or_and: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x7020500 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, v2, s0, v3 +; GCN-NEXT: v_perm_b32 v2, v2, s2, v3 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm bb: @@ -115,17 +115,17 @@ bb: define amdgpu_kernel void @lsh8_or_lsr24(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: lsh8_or_lsr24: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x2010007 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: 
v_add_u32_e32 v0, vcc, s0, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, s0, v2, v3 +; GCN-NEXT: v_perm_b32 v2, s2, v2, v3 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm bb: @@ -142,17 +142,17 @@ bb: define amdgpu_kernel void @lsh16_or_lsr24(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: lsh16_or_lsr24: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x5040c03 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, v2, s0, v3 +; GCN-NEXT: v_perm_b32 v2, v2, s2, v3 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm bb: @@ -169,17 +169,17 @@ bb: define amdgpu_kernel void @and_xor_and(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: and_xor_and: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x7020104 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, v2, s0, v3 +; GCN-NEXT: v_perm_b32 v2, v2, s2, v3 ; GCN-NEXT: flat_store_dword 
v[0:1], v2 ; GCN-NEXT: s_endpgm bb: @@ -197,15 +197,15 @@ bb: define amdgpu_kernel void @and_or_or_and(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: and_or_or_and: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] -; GCN-NEXT: s_and_b32 s0, s0, 0xff00 +; GCN-NEXT: s_and_b32 s0, s2, 0xff00 ; GCN-NEXT: s_or_b32 s0, s0, 0xffff0000 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_and_b32_e32 v2, 0xff00ff, v2 @@ -227,17 +227,17 @@ bb: define amdgpu_kernel void @and_or_and_shl(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: and_or_and_shl: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x50c0c00 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, v2, s0, v3 +; GCN-NEXT: v_perm_b32 v2, v2, s2, v3 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm bb: @@ -255,17 +255,17 @@ bb: define amdgpu_kernel void @or_and_or(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: or_and_or: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dword s0, 
s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x7020104 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, v2, s0, v3 +; GCN-NEXT: v_perm_b32 v2, v2, s2, v3 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm bb: @@ -283,20 +283,20 @@ bb: define amdgpu_kernel void @known_ffff0500(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: known_ffff0500: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v5, 0xffff8004 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v4, v[0:1] -; GCN-NEXT: s_bitset1_b32 s0, 15 -; GCN-NEXT: s_and_b32 s0, s0, 0xff00 +; GCN-NEXT: s_bitset1_b32 s2, 15 +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: s_and_b32 s0, s2, 0xff00 ; GCN-NEXT: s_or_b32 s0, s0, 0xffff0000 -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_or_b32_e32 v4, 4, v4 ; GCN-NEXT: v_and_b32_e32 v4, 0xff00ff, v4 @@ -323,21 +323,21 @@ bb: define amdgpu_kernel void @known_050c0c00(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: known_050c0c00: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: 
s_load_dword s0, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v5, 0x50c0c00 ; GCN-NEXT: v_mov_b32_e32 v6, 4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v4, v[0:1] -; GCN-NEXT: s_or_b32 s0, s0, 4 -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: s_or_b32 s2, s2, 4 +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v4, v4, s0, v5 +; GCN-NEXT: v_perm_b32 v4, v4, s2, v5 ; GCN-NEXT: flat_store_dword v[0:1], v4 ; GCN-NEXT: flat_store_dword v[2:3], v6 ; GCN-NEXT: s_endpgm @@ -359,22 +359,22 @@ bb: define amdgpu_kernel void @known_ffff8004(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: known_ffff8004: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v5, 0xffff0500 ; GCN-NEXT: v_mov_b32_e32 v6, 0xffff8004 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v4, v[0:1] -; GCN-NEXT: s_or_b32 s0, s0, 4 -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: s_or_b32 s2, s2, 4 +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_or_b32_e32 v4, 0x8000, v4 -; GCN-NEXT: v_perm_b32 v4, v4, s0, v5 +; GCN-NEXT: v_perm_b32 v4, 
v4, s2, v5 ; GCN-NEXT: flat_store_dword v[0:1], v4 ; GCN-NEXT: flat_store_dword v[2:3], v6 ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll index 048a7756a7a04..bf98af33dc7b0 100644 --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -609,53 +609,53 @@ define amdgpu_kernel void @shuffle8i8(ptr addrspace(1) %in0, ptr addrspace(1) %i ; GFX10-LABEL: shuffle8i8: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s1, s1, 8 +; GFX10-NEXT: s_lshr_b32 s3, s3, 8 ; GFX10-NEXT: s_lshr_b32 s4, s9, 16 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, s9 ; GFX10-NEXT: v_and_b32_e64 v1, 0xffffff00, s8 ; GFX10-NEXT: v_lshlrev_b16 v2, 8, s4 ; GFX10-NEXT: v_lshlrev_b16 v3, 8, s8 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-NEXT: v_or_b32_sdwa v0, s1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10-NEXT: v_or_b32_sdwa v0, s3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v1, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v3, s2, 
v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: shuffle8i8: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffffff00 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s1, s1, 8 +; GFX9-NEXT: s_lshr_b32 s3, s3, 8 ; GFX9-NEXT: v_lshlrev_b16_e64 v1, 8, s9 -; GFX9-NEXT: v_or_b32_sdwa v4, s1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_lshr_b32 s1, s9, 16 -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: v_or_b32_sdwa v4, s3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_lshr_b32 s3, s9, 16 +; GFX9-NEXT: s_lshr_b32 s4, s2, 16 ; GFX9-NEXT: v_lshlrev_b16_e64 v3, 8, s8 ; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 -; GFX9-NEXT: v_lshlrev_b16_e64 v1, 8, s1 -; GFX9-NEXT: v_or_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e64 v1, 8, s3 +; GFX9-NEXT: v_or_b32_sdwa v3, s2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm bb: %vec0 = load <8 x i8>, ptr addrspace(1) %in0 diff --git a/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll b/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll index 4794c29621525..f53ca53518a17 100644 --- a/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll +++ b/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @dbg_clause(ptr addrspace(1) %out, ptr addrspace(1) %aptr) !dbg !4 { ; GCN-LABEL: dbg_clause: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dword v1, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll index a030f86da1b67..5a03381447d0e 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll @@ -8,7 +8,7 @@ ; NON-HSA: s_endpgm ; ASM: .fill 63, 4, 0xbf800000 ; s_nop 0 ; OBJ-COUNT-63: s_nop 0 -define amdgpu_kernel void @preload_kernarg_header(ptr %arg) { +define amdgpu_kernel void @preload_kernarg_header(ptr inreg %arg) { store ptr %arg, ptr %arg ret void } diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll index e076df97e1ba4..a547c258e3921 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll @@ -1,14 +1,18 @@ ; NOTE: 
Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NO-PRELOAD %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-1 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-2 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-4 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-8 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-NO-PRELOAD %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-1 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-2 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-4 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-8 %s -define amdgpu_kernel void @ptr1_i8_kernel_preload_arg(ptr addrspace(1) %out, i8 %arg0) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_i8_kernel_preload_arg: +define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_i8: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -19,27 
+23,51 @@ define amdgpu_kernel void @ptr1_i8_kernel_preload_arg(ptr addrspace(1) %out, i8 ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: ptr1_i8_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xff +; GFX940-PRELOAD-1-LABEL: ptr1_i8: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: s_and_b32 s0, s4, 0xff +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: ptr1_i8: +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xff ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: ptr1_i8_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xff +; GFX940-PRELOAD-4-LABEL: ptr1_i8: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xff +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: ptr1_i8: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xff ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: ptr1_i8: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -50,32 +78,56 @@ define amdgpu_kernel void @ptr1_i8_kernel_preload_arg(ptr addrspace(1) %out, i8 ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: ptr1_i8_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_and_b32 s0, s8, 0xff +; GFX90a-PRELOAD-1-LABEL: ptr1_i8: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xff +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: ptr1_i8: +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xff +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: ptr1_i8_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_and_b32 s0, s8, 0xff +; GFX90a-PRELOAD-4-LABEL: ptr1_i8: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xff +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: ptr1_i8: +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xff +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm %ext = zext i8 %arg0 to i32 store i32 %ext, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @ptr1_i8_zext_kernel_preload_arg(ptr addrspace(1) %out, i8 zeroext %arg0) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_i8_zext_kernel_preload_arg: +define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %arg0) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_i8_zext_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -86,29 +138,51 @@ define amdgpu_kernel void @ptr1_i8_zext_kernel_preload_arg(ptr addrspace(1) %out ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: ptr1_i8_zext_kernel_preload_arg: -; GFX940-PRELOAD-2: 
s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_mov_b32 s0, 0xffff -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-1-LABEL: ptr1_i8_zext_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: s_and_b32 s0, s4, 0xff +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: ptr1_i8_zext_arg: +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xff +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: ptr1_i8_zext_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_mov_b32 s0, 0xffff -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-4-LABEL: ptr1_i8_zext_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xff +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: ptr1_i8_zext_arg: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xff +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_zext_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_zext_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -119,34 +193,56 @@ define amdgpu_kernel void @ptr1_i8_zext_kernel_preload_arg(ptr addrspace(1) %out ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: ptr1_i8_zext_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_mov_b32 s0, 0xffff -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-1-LABEL: ptr1_i8_zext_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xff +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: ptr1_i8_zext_arg: +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xff +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: ptr1_i8_zext_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_mov_b32 s0, 0xffff -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-4-LABEL: ptr1_i8_zext_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xff +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: ptr1_i8_zext_arg: +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xff +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm %ext = zext i8 %arg0 to i32 store i32 %ext, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @ptr1_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %arg0) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_i16_kernel_preload_arg: +define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_i16_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -157,27 +253,51 @@ define amdgpu_kernel void @ptr1_i16_kernel_preload_arg(ptr addrspace(1) %out, i1 ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, 
s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: ptr1_i16_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xffff +; GFX940-PRELOAD-1-LABEL: ptr1_i16_preload_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: s_and_b32 s0, s4, 0xffff +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: ptr1_i16_preload_arg: +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xffff ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: ptr1_i16_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xffff +; GFX940-PRELOAD-4-LABEL: ptr1_i16_preload_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xffff +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: ptr1_i16_preload_arg: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xffff ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -188,32 +308,56 @@ define amdgpu_kernel void @ptr1_i16_kernel_preload_arg(ptr addrspace(1) %out, i1 ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: ptr1_i16_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_and_b32 s0, s8, 0xffff +; GFX90a-PRELOAD-1-LABEL: ptr1_i16_preload_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xffff +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: ptr1_i16_preload_arg: +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xffff +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: ptr1_i16_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_and_b32 s0, s8, 0xffff +; GFX90a-PRELOAD-4-LABEL: ptr1_i16_preload_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xffff +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: ptr1_i16_preload_arg: +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xffff +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm %ext = zext i16 %arg0 to i32 store i32 %ext, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @ptr1_i32_kernel_preload_arg(ptr addrspace(1) %out, i32 %arg0) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_i32_kernel_preload_arg: +define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_i32_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -223,25 +367,47 @@ define amdgpu_kernel void @ptr1_i32_kernel_preload_arg(ptr addrspace(1) %out, i3 ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: 
ptr1_i32_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-1-LABEL: ptr1_i32_preload_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: ptr1_i32_preload_arg: +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: ptr1_i32_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-4-LABEL: ptr1_i32_preload_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: ptr1_i32_preload_arg: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_i32_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: ptr1_i32_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -251,30 +417,52 @@ define amdgpu_kernel void @ptr1_i32_kernel_preload_arg(ptr addrspace(1) %out, i3 ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: ptr1_i32_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-1-LABEL: ptr1_i32_preload_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: ptr1_i32_preload_arg: +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: ptr1_i32_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-4-LABEL: ptr1_i32_preload_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: ptr1_i32_preload_arg: +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store i32 %arg0, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @i32_ptr1_i32_kernel_preload_arg(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) #0 { -; GFX940-NO-PRELOAD-LABEL: i32_ptr1_i32_kernel_preload_arg: +define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) #0 { +; GFX940-NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x10 ; GFX940-NO-PRELOAD-NEXT: s_load_dword s5, s[0:1], 0x0 @@ -286,29 +474,55 @@ define amdgpu_kernel void @i32_ptr1_i32_kernel_preload_arg(i32 %arg0, ptr addrsp ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: i32_ptr1_i32_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s0, s[0:1], 0x10 +; GFX940-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x10 +; GFX940-PRELOAD-1-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: s_add_i32 s0, s5, s4 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg: +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x10 +; GFX940-PRELOAD-2-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: s_add_i32 s0, s2, s0 +; GFX940-PRELOAD-2-NEXT: s_add_i32 s0, s5, s4 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: i32_ptr1_i32_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_add_i32 s0, s2, s6 +; GFX940-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x10 +; GFX940-PRELOAD-4-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: s_add_i32 s0, s5, s4 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x10 +; GFX940-PRELOAD-8-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: s_add_i32 s0, s5, s4 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: i32_ptr1_i32_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x10 ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s3, s[4:5], 0x0 @@ -320,34 +534,60 @@ define amdgpu_kernel void @i32_ptr1_i32_kernel_preload_arg(i32 %arg0, ptr addrsp ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: i32_ptr1_i32_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX90a-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x10 +; GFX90a-PRELOAD-1-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: s_add_i32 s2, s3, s2 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg: +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x10 +; GFX90a-PRELOAD-2-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: s_add_i32 s0, s6, s0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[8:9] +; GFX90a-PRELOAD-2-NEXT: s_add_i32 s2, s3, s2 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: i32_ptr1_i32_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_add_i32 s0, s6, s10 +; GFX90a-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x10 +; GFX90a-PRELOAD-4-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: s_add_i32 s2, s3, s2 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg: +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x10 +; GFX90a-PRELOAD-8-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[8:9] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: s_add_i32 s2, s3, s2 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm %add = add i32 %arg0, %arg1 store i32 %add, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @ptr1_i16_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_i16_i16_kernel_preload_arg: +define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -360,33 +600,59 @@ define amdgpu_kernel void @ptr1_i16_i16_kernel_preload_arg(ptr addrspace(1) %out ; 
GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: ptr1_i16_i16_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s0, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_and_b32 s1, s4, 0xffff +; GFX940-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-1-NEXT: s_and_b32 s1, s4, 0xffff +; GFX940-PRELOAD-1-NEXT: s_add_i32 s0, s1, s0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg: +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-2-NEXT: s_and_b32 s1, s4, 0xffff ; GFX940-PRELOAD-2-NEXT: s_add_i32 s0, s1, s0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: ptr1_i16_i16_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-4-NEXT: s_and_b32 s1, s4, 0xffff +; GFX940-PRELOAD-4-NEXT: s_add_i32 s0, s1, s0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 ; GFX940-PRELOAD-8-NEXT: s_and_b32 s1, s4, 0xffff ; GFX940-PRELOAD-8-NEXT: s_add_i32 s0, s1, s0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_i16_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -399,30 +665,56 @@ define amdgpu_kernel void @ptr1_i16_i16_kernel_preload_arg(ptr addrspace(1) %out ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: ptr1_i16_i16_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s0, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_and_b32 s1, s8, 0xffff +; GFX90a-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: s_lshr_b32 s3, s2, 16 +; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xffff +; GFX90a-PRELOAD-1-NEXT: s_add_i32 s2, s2, s3 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg: +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s0, 16 -; GFX90a-PRELOAD-2-NEXT: s_add_i32 s0, s1, s0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s3, s2, 16 +; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xffff +; GFX90a-PRELOAD-2-NEXT: s_add_i32 s2, s2, s3 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: ptr1_i16_i16_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 -; GFX90a-PRELOAD-8-NEXT: s_and_b32 s1, s8, 0xffff -; GFX90a-PRELOAD-8-NEXT: s_add_i32 s0, s1, s0 +; GFX90a-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s3, s2, 16 +; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xffff +; GFX90a-PRELOAD-4-NEXT: s_add_i32 s2, s2, s3 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg: +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s3, s2, 16 +; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xffff +; GFX90a-PRELOAD-8-NEXT: s_add_i32 s2, s2, s3 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm %ext = zext i16 %arg0 to i32 %ext1 = zext i16 %arg1 to i32 @@ -431,8 +723,8 @@ define amdgpu_kernel void @ptr1_i16_i16_kernel_preload_arg(ptr addrspace(1) %out ret void } -define amdgpu_kernel void @ptr1_v2i8_kernel_preload_arg(ptr addrspace(1) %out, <2 x i8> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_v2i8_kernel_preload_arg: +define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg: ; 
GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -442,29 +734,47 @@ define amdgpu_kernel void @ptr1_v2i8_kernel_preload_arg(ptr addrspace(1) %out, < ; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: ptr1_v2i8_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8 -; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-PRELOAD-2-NEXT: global_store_short v1, v0, s[2:3] sc0 sc1 +; GFX940-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-1-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg: +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: ptr1_v2i8_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. 
Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8 -; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-PRELOAD-8-NEXT: global_store_short v1, v0, s[2:3] sc0 sc1 +; GFX940-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-4-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_v2i8_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -474,34 +784,52 @@ define amdgpu_kernel void @ptr1_v2i8_kernel_preload_arg(ptr addrspace(1) %out, < ; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: ptr1_v2i8_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. 
Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 -; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0 -; GFX90a-PRELOAD-2-NEXT: global_store_short v1, v0, s[6:7] +; GFX90a-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-1-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg: +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: ptr1_v2i8_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 -; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 -; GFX90a-PRELOAD-8-NEXT: global_store_short v1, v0, s[6:7] +; GFX90a-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-4-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg: +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <2 x i8> %in, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @byref_kernel_preload_arg(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) #0 { -; GFX940-NO-PRELOAD-LABEL: byref_kernel_preload_arg: +define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) #0 { +; GFX940-NO-PRELOAD-LABEL: byref_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -515,37 +843,63 @@ define amdgpu_kernel void @byref_kernel_preload_arg(ptr addrspace(1) %out, ptr a ; 
GFX940-NO-PRELOAD-NEXT: s_waitcnt vmcnt(0) ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: byref_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100 +; GFX940-PRELOAD-1-LABEL: byref_preload_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100 +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s3 +; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_waitcnt vmcnt(0) +; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_waitcnt vmcnt(0) +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: byref_preload_arg: +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s1 -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s3 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_waitcnt vmcnt(0) -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_waitcnt vmcnt(0) ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: 
byref_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100 +; GFX940-PRELOAD-4-LABEL: byref_preload_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100 +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s3 +; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_waitcnt vmcnt(0) +; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_waitcnt vmcnt(0) +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: byref_preload_arg: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s1 -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s3 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_waitcnt vmcnt(0) -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_waitcnt vmcnt(0) ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: byref_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: byref_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; 
GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 @@ -559,33 +913,59 @@ define amdgpu_kernel void @byref_kernel_preload_arg(ptr addrspace(1) %out, ptr a ; GFX90a-NO-PRELOAD-NEXT: s_waitcnt vmcnt(0) ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: byref_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-1-LABEL: byref_preload_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s1 +; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90a-PRELOAD-1-NEXT: s_waitcnt vmcnt(0) +; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v2, s[2:3] +; GFX90a-PRELOAD-1-NEXT: s_waitcnt vmcnt(0) +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: byref_preload_arg: +; GFX90a-PRELOAD-2: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s1 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90a-PRELOAD-2-NEXT: s_waitcnt vmcnt(0) -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[6:7] +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[2:3] ; GFX90a-PRELOAD-2-NEXT: s_waitcnt vmcnt(0) ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; 
GFX90a-PRELOAD-8-LABEL: byref_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-4-LABEL: byref_preload_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s1 +; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90a-PRELOAD-4-NEXT: s_waitcnt vmcnt(0) +; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v2, s[2:3] +; GFX90a-PRELOAD-4-NEXT: s_waitcnt vmcnt(0) +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: byref_preload_arg: +; GFX90a-PRELOAD-8: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s1 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90a-PRELOAD-8-NEXT: s_waitcnt vmcnt(0) -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[6:7] +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[2:3] ; GFX90a-PRELOAD-8-NEXT: s_waitcnt vmcnt(0) ; GFX90a-PRELOAD-8-NEXT: s_endpgm %in = load i32, ptr addrspace(4) %in.byref @@ -595,8 +975,8 @@ define amdgpu_kernel void @byref_kernel_preload_arg(ptr addrspace(1) %out, ptr a } -define amdgpu_kernel void @v8i32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v8i32_kernel_preload_arg: +define amdgpu_kernel void 
@v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v8i32_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 ; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0 @@ -615,47 +995,83 @@ define amdgpu_kernel void @v8i32_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: v8i32_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-1-LABEL: v8i32_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11 +; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_nop 1 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s7 +; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v8i32_arg: +; GFX940-PRELOAD-2: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 ; GFX940-PRELOAD-2-NEXT: 
v_mov_b32_e32 v1, s9 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_nop 1 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: v8i32_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-4-LABEL: v8i32_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11 +; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_nop 1 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s7 +; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v8i32_arg: +; GFX940-PRELOAD-8: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 ; 
GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_nop 1 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v8i32_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v8i32_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -674,51 +1090,87 @@ define amdgpu_kernel void @v8i32_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: v8i32_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-1-LABEL: v8i32_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s12 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s13 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s14 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s15 +; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11 +; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v8i32_arg: +; GFX90a-PRELOAD-2: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s12 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s13 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; GFX90a-PRELOAD-2-NEXT: s_nop 0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; 
GFX90a-PRELOAD-8-LABEL: v8i32_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-4-LABEL: v8i32_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s12 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s13 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s14 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s15 +; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11 +; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v8i32_arg: +; GFX90a-PRELOAD-8: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s12 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s13 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; GFX90a-PRELOAD-8-NEXT: s_nop 0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 
v3, s11 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <8 x i32> %in, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @v3i16_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v3i16_kernel_preload_arg: +define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v3i16_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 @@ -729,29 +1181,51 @@ define amdgpu_kernel void @v3i16_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: v3i16_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-1-LABEL: v3i16_preload_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-PRELOAD-1-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v3i16_preload_arg: +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: v3i16_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-4-LABEL: v3i16_preload_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-PRELOAD-4-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v3i16_preload_arg: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v3i16_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v3i16_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 @@ -762,33 +1236,55 @@ define amdgpu_kernel void @v3i16_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: v3i16_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-1-LABEL: v3i16_preload_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-1-NEXT: global_store_short v0, v1, s[0:1] offset:4 +; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v3i16_preload_arg: +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] offset:4 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] offset:4 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: v3i16_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-4-LABEL: v3i16_preload_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-4-NEXT: global_store_short v0, v1, s[0:1] offset:4 +; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v3i16_preload_arg: +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] offset:4 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] offset:4 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <3 x i16> %in, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @v3i32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v3i32_kernel_preload_arg: +define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v3i32_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -800,29 +1296,55 @@ define amdgpu_kernel void @v3i32_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; 
GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: v3i32_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-1-LABEL: v3i32_preload_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v3i32_preload_arg: +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: v3i32_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-4-LABEL: v3i32_preload_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v3i32_preload_arg: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v3i32_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v3i32_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -834,33 +1356,59 @@ define amdgpu_kernel void @v3i32_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: v3i32_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-1-LABEL: v3i32_preload_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v3i32_preload_arg: +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 ; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: v3i32_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-4-LABEL: v3i32_preload_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v3i32_preload_arg: +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 ; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <3 x i32> %in, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @v3f32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v3f32_kernel_preload_arg: +define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v3f32_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -872,29 +1420,55 @@ define amdgpu_kernel void @v3f32_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], 
s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: v3f32_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-1-LABEL: v3f32_preload_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v3f32_preload_arg: +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: v3f32_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-4-LABEL: v3f32_preload_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v3f32_preload_arg: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v3f32_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v3f32_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -906,33 +1480,59 @@ define amdgpu_kernel void @v3f32_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: v3f32_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-1-LABEL: v3f32_preload_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v3f32_preload_arg: +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 ; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: v3f32_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-4-LABEL: v3f32_preload_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v3f32_preload_arg: +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 ; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <3 x float> %in, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @v5i8_kernel_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v5i8_kernel_preload_arg: +define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v5i8_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 @@ -943,43 +1543,51 @@ define amdgpu_kernel void @v5i8_kernel_preload_arg(ptr addrspace(1) nocapture %o ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; 
GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: v5i8_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8 -; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 24 -; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s5 -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-PRELOAD-2-NEXT: global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1 -; GFX940-PRELOAD-2-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 +; GFX940-PRELOAD-1-LABEL: v5i8_preload_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-PRELOAD-1-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v5i8_preload_arg: +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 +; 
GFX940-PRELOAD-2-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: v5i8_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8 -; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24 -; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s5 -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-PRELOAD-8-NEXT: global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1 -; GFX940-PRELOAD-8-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 +; GFX940-PRELOAD-4-LABEL: v5i8_preload_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-PRELOAD-4-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v5i8_preload_arg: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-PRELOAD-8-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v5i8_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v5i8_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 @@ -990,47 +1598,55 @@ define amdgpu_kernel void @v5i8_kernel_preload_arg(ptr addrspace(1) nocapture %o ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: v5i8_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 -; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 24 -; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 16 -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s9 -; GFX90a-PRELOAD-2-NEXT: global_store_byte v1, v2, s[6:7] offset:4 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v1, v0, s[6:7] +; GFX90a-PRELOAD-1-LABEL: v5i8_preload_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 
0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-1-NEXT: global_store_byte v0, v1, s[0:1] offset:4 +; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v5i8_preload_arg: +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_byte v0, v1, s[0:1] offset:4 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: v5i8_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 -; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24 -; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s9 -; GFX90a-PRELOAD-8-NEXT: global_store_byte v1, v2, s[6:7] offset:4 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v1, v0, s[6:7] +; GFX90a-PRELOAD-4-LABEL: v5i8_preload_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-4-NEXT: global_store_byte v0, v1, s[0:1] offset:4 +; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v5i8_preload_arg: +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-8-NEXT: global_store_byte v0, v1, s[0:1] offset:4 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <5 x i8> %in, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void 
@v5f64_kernel_preload_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v5f64_kernel_preload_arg: +define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v5f64_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 @@ -1052,53 +1668,95 @@ define amdgpu_kernel void @v5f64_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: v5f64_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60 +; GFX940-PRELOAD-1-LABEL: v5f64_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60 +; GFX940-PRELOAD-1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 +; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8 +; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11 +; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_nop 1 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s7 +; 
GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v5f64_arg: +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60 ; GFX940-PRELOAD-2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[2:3], s[12:13] +; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_nop 1 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: v5f64_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60 +; GFX940-PRELOAD-4-LABEL: v5f64_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60 +; GFX940-PRELOAD-4-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 +; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8 +; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11 +; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_nop 1 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s7 +; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v5f64_arg: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60 ; GFX940-PRELOAD-8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[2:3], s[12:13] +; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 ; 
GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_nop 1 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v5f64_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v5f64_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 @@ -1120,57 +1778,99 @@ define amdgpu_kernel void @v5f64_kernel_preload_arg(ptr addrspace(1) nocapture % ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: v5f64_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-1-LABEL: v5f64_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s12 +; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s13 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s14 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s15 +; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 +; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11 +; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v5f64_arg: +; GFX90a-PRELOAD-2: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s12 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s13 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, 
v[0:3], s[6:7] offset:16 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 ; GFX90a-PRELOAD-2-NEXT: s_nop 0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: v5f64_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-4-LABEL: v5f64_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s12 +; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s13 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s14 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s15 +; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 +; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11 +; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v5f64_arg: +; GFX90a-PRELOAD-8: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 ; 
GFX90a-PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s12 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s13 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 ; GFX90a-PRELOAD-8-NEXT: s_nop 0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <5 x double> %in, ptr addrspace(1) %out, align 8 ret void } -define amdgpu_kernel void @v8i8_kernel_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v8i8_kernel_preload_arg: +define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v8i8_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0 @@ -1179,57 +1879,43 @@ define amdgpu_kernel void @v8i8_kernel_preload_arg(ptr addrspace(1) %out, <8 x i ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-2-LABEL: v8i8_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. 
Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 8 -; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 24 -; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 16 -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8 -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 24 -; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v2, 8, s0 -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX940-PRELOAD-1-LABEL: v8i8_preload_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v8i8_preload_arg: +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 
-; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-8-LABEL: v8i8_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 8 -; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 24 -; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 16 -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8 -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24 -; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v2, 8, s0 -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX940-PRELOAD-4-LABEL: v8i8_preload_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 
+; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v8i8_preload_arg: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v8i8_kernel_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v8i8_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0 @@ -1238,52 +1924,40 @@ define amdgpu_kernel void @v8i8_kernel_preload_arg(ptr addrspace(1) %out, <8 x i ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-2-LABEL: v8i8_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 8 -; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 24 -; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 16 -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 24 -; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v2, 8, s0 -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 16 -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX90a-PRELOAD-1-LABEL: v8i8_preload_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v8i8_preload_arg: +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-8-LABEL: v8i8_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 8 -; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 24 -; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 16 -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24 -; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v2, 8, s0 -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX90a-PRELOAD-4-LABEL: v8i8_preload_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; 
GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v8i8_preload_arg: +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <8 x i8> %in, ptr addrspace(1) %out ret void @@ -1300,22 +1974,44 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; +; GFX940-PRELOAD-1-LABEL: i64_kernel_preload_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; ; GFX940-PRELOAD-2-LABEL: i64_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; +; GFX940-PRELOAD-4-LABEL: i64_kernel_preload_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; ; GFX940-PRELOAD-8-LABEL: i64_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; ; GFX90a-NO-PRELOAD-LABEL: i64_kernel_preload_arg: @@ -1328,22 +2024,44 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; +; GFX90a-PRELOAD-1-LABEL: i64_kernel_preload_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; ; GFX90a-PRELOAD-2-LABEL: i64_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; +; GFX90a-PRELOAD-4-LABEL: i64_kernel_preload_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; ; GFX90a-PRELOAD-8-LABEL: i64_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store i64 %a, ptr addrspace(1) %out, align 8 ret void @@ -1360,22 +2078,44 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; +; GFX940-PRELOAD-1-LABEL: f64_kernel_preload_arg: +; GFX940-PRELOAD-1: ; %bb.0: +; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-1-NEXT: s_endpgm +; ; GFX940-PRELOAD-2-LABEL: f64_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; +; GFX940-PRELOAD-4-LABEL: f64_kernel_preload_arg: +; GFX940-PRELOAD-4: ; %bb.0: +; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-4-NEXT: s_endpgm +; ; GFX940-PRELOAD-8-LABEL: f64_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; ; GFX90a-NO-PRELOAD-LABEL: f64_kernel_preload_arg: @@ -1388,1137 +2128,47 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; +; GFX90a-PRELOAD-1-LABEL: f64_kernel_preload_arg: +; GFX90a-PRELOAD-1: ; %bb.0: +; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 +; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2 +; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-1-NEXT: s_endpgm +; ; GFX90a-PRELOAD-2-LABEL: f64_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; +; GFX90a-PRELOAD-4-LABEL: f64_kernel_preload_arg: +; GFX90a-PRELOAD-4: ; %bb.0: +; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 +; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2 +; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-4-NEXT: s_endpgm +; ; GFX90a-PRELOAD-8-LABEL: f64_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store double %in, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) %out, half %in) #0 { -; GFX940-NO-PRELOAD-LABEL: half_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: half_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: half_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: half_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: half_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: half_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store half %in, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) %out, bfloat %in) #0 { -; GFX940-NO-PRELOAD-LABEL: bfloat_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: bfloat_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: bfloat_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: bfloat_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store bfloat %in, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) %out, <2 x bfloat> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v2bfloat_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v2bfloat_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v2bfloat_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: v2bfloat_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v2bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v2bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store <2 x bfloat> %in, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) %out, <3 x bfloat> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v3bfloat_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v3bfloat_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v3bfloat_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: v3bfloat_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 -; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v3bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] offset:4 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v3bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] offset:4 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store <3 x bfloat> %in, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) %out, <6 x bfloat> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v6bfloat_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v6bfloat_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v6bfloat_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: v6bfloat_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v6bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v6bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store <6 x bfloat> %in, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) %out, half %in, <7 x bfloat> %in2, ptr addrspace(1) %out2) #0 { -; GFX940-NO-PRELOAD-LABEL: half_v7bfloat_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dword s10, s[0:1], 0x8 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s10 -; GFX940-NO-PRELOAD-NEXT: global_store_short v3, v0, s[2:3] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s7 -; GFX940-NO-PRELOAD-NEXT: global_store_short v3, v0, s[8:9] offset:12 sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: half_v7bfloat_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x10 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-2-NEXT: global_store_short v3, v0, s[2:3] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s11 -; GFX940-PRELOAD-2-NEXT: global_store_short v3, v0, s[6:7] offset:12 sc0 sc1 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: half_v7bfloat_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-8-NEXT: global_store_short v3, v0, s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s9 -; GFX940-PRELOAD-8-NEXT: global_store_short v3, v0, s[10:11] offset:12 sc0 sc1 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: half_v7bfloat_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dword s10, s[4:5], 0x8 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-NO-PRELOAD-NEXT: global_store_short v3, v0, s[6:7] -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s3 -; GFX90a-NO-PRELOAD-NEXT: global_store_short v3, v0, s[8:9] offset:12 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: half_v7bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x20 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 -; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v0, s[6:7] -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s3 -; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v0, s[10:11] offset:12 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: half_v7bfloat_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 -; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v0, s[6:7] -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s13 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v0, s[0:1] offset:12 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store half %in, ptr addrspace(1) %out - store <7 x bfloat> %in2, ptr addrspace(1) %out2 - ret void -} - -define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) %out, i1 %in) #0 { -; GFX940-NO-PRELOAD-LABEL: i1_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; 
GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: s_and_b32 s0, s4, 1 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-NO-PRELOAD-NEXT: global_store_byte v0, v1, s[2:3] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: i1_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 1 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-2-NEXT: global_store_byte v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: i1_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 1 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-8-NEXT: global_store_byte v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: i1_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: s_and_b32 s2, s2, 1 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-NO-PRELOAD-NEXT: global_store_byte v0, v1, s[0:1] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: i1_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_and_b32 s0, s8, 1 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-PRELOAD-2-NEXT: global_store_byte v0, v1, s[6:7] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: i1_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_and_b32 s0, s8, 1 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-PRELOAD-8-NEXT: global_store_byte v0, v1, s[6:7] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store i1 %in, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) %out, fp128 %in) #0 { -; GFX940-NO-PRELOAD-LABEL: fp128_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX940-NO-PRELOAD-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX940-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: fp128_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s9 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: fp128_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s9 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: fp128_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90a-NO-PRELOAD-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: fp128_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s13 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: fp128_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s13 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store fp128 %in, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) %out, <7 x i8> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v7i8_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-NO-PRELOAD-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:6 sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v7i8_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8 -; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 24 -; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 8 -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s5 -; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-2-NEXT: global_store_byte_d16_hi v2, v3, s[2:3] offset:6 sc0 sc1 -; GFX940-PRELOAD-2-NEXT: global_store_short v2, v1, s[2:3] offset:4 sc0 sc1 -; GFX940-PRELOAD-2-NEXT: global_store_dword v2, v0, s[2:3] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v7i8_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8 -; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24 -; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 8 -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s5 -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: global_store_byte_d16_hi v2, v3, s[2:3] offset:6 sc0 sc1 -; GFX940-PRELOAD-8-NEXT: global_store_short v2, v1, s[2:3] offset:4 sc0 sc1 -; GFX940-PRELOAD-8-NEXT: global_store_dword v2, v0, s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: v7i8_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-NO-PRELOAD-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:6 -; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 -; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v7i8_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. 
Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 -; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 24 -; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 16 -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 8 -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s9 -; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-2-NEXT: global_store_byte_d16_hi v2, v3, s[6:7] offset:6 -; GFX90a-PRELOAD-2-NEXT: global_store_short v2, v1, s[6:7] offset:4 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v2, v0, s[6:7] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v7i8_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 -; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24 -; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 8 -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s9 -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: global_store_byte_d16_hi v2, v3, s[6:7] offset:6 -; GFX90a-PRELOAD-8-NEXT: global_store_short v2, v1, s[6:7] offset:4 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v2, v0, s[6:7] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store <7 x i8> %in, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) %out, <7 x half> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v7half_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-NO-PRELOAD-NEXT: global_store_short v3, v1, s[2:3] offset:12 sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5 -; 
GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v7half_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s9 -; GFX940-PRELOAD-2-NEXT: global_store_short v3, v0, s[2:3] offset:12 sc0 sc1 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v7half_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s9 -; GFX940-PRELOAD-8-NEXT: global_store_short v3, v0, s[2:3] offset:12 sc0 sc1 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: v7half_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-NO-PRELOAD-NEXT: global_store_short v3, v1, s[6:7] offset:12 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v7half_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s13 -; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v0, s[6:7] offset:12 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v7half_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s13 -; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v0, s[6:7] offset:12 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store <7 x half> %in, ptr addrspace(1) %out - ret void -} - -; Test when previous argument was not dword aligned. 
-define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, i32 %in2, ptr addrspace(1) %out2) #0 { -; GFX940-NO-PRELOAD-LABEL: i16_i32_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x10 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s7 -; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[4:5] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: i16_i32_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s5, s[0:1], 0xc -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x10 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: i16_i32_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: i16_i32_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s3 -; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] -; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[6:7] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: i16_i32_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0xc -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: i16_i32_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[10:11] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store i16 %in, ptr addrspace(1) %out - store i32 %in2, ptr addrspace(1) %out2 - ret void -} - -define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, <3 x i32> %in2, ptr addrspace(1) %out2) #0 { -; GFX940-NO-PRELOAD-LABEL: i16_v3i32_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: s_load_dword s7, s[0:1], 0x8 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, s7 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-NO-PRELOAD-NEXT: global_store_short v3, v4, s[2:3] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: i16_v3i32_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x10 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x20 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-PRELOAD-2-NEXT: global_store_short v3, v4, s[2:3] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: i16_v3i32_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-PRELOAD-8-NEXT: global_store_short v3, v4, s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: i16_v3i32_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: s_load_dword s3, s[4:5], 0x8 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, s3 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-NO-PRELOAD-NEXT: 
v_mov_b32_e32 v1, s1 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-NO-PRELOAD-NEXT: global_store_short v3, v4, s[6:7] -; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: i16_v3i32_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x20 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, s8 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v4, s[6:7] -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: i16_v3i32_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, s8 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 -; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v4, s[6:7] -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store i16 %in, ptr addrspace(1) %out - store <3 x i32> %in2, ptr addrspace(1) %out2 - ret void -} - -define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, i16 %in2, ptr addrspace(1) %out2) #0 { -; GFX940-NO-PRELOAD-LABEL: i16_i16_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dword s6, s[0:1], 0x8 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x10 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: i16_i16_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s5, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x10 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: i16_i16_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: i16_i16_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dword s6, s[4:5], 0x8 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6 -; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] -; GFX90a-NO-PRELOAD-NEXT: global_store_short_d16_hi v0, v1, s[2:3] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: i16_i16_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[0:1] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: i16_i16_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] -; GFX90a-PRELOAD-8-NEXT: global_store_short_d16_hi v0, v1, s[10:11] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store i16 %in, ptr addrspace(1) %out - store i16 %in2, ptr addrspace(1) %out2 - ret void -} - -define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, <2 x i8> %in2, ptr addrspace(1) %out2) #0 { -; GFX940-NO-PRELOAD-LABEL: i16_v2i8_kernel_preload_arg: -; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dword s6, s[0:1], 0x8 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x10 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: i16_v2i8_kernel_preload_arg: -; 
GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-2-NEXT: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s5, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x10 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1 -; GFX940-PRELOAD-2-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: i16_v2i8_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX940-PRELOAD-8-NEXT: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24 -; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-PRELOAD-8-NEXT: global_store_short v1, v2, s[2:3] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: global_store_short v1, v0, s[6:7] sc0 sc1 -; GFX940-PRELOAD-8-NEXT: s_endpgm -; -; GFX90a-NO-PRELOAD-LABEL: i16_v2i8_kernel_preload_arg: -; GFX90a-NO-PRELOAD: ; %bb.0: -; GFX90a-NO-PRELOAD-NEXT: s_load_dword s6, s[4:5], 0x8 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6 -; GFX90a-NO-PRELOAD-NEXT: global_store_short 
v0, v1, s[0:1] -; GFX90a-NO-PRELOAD-NEXT: global_store_short_d16_hi v0, v1, s[2:3] -; GFX90a-NO-PRELOAD-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: i16_v2i8_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-2-NEXT: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[0:1] -; GFX90a-PRELOAD-2-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: i16_v2i8_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
-; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 -; GFX90a-PRELOAD-8-NEXT: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24 -; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 -; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-PRELOAD-8-NEXT: global_store_short v1, v2, s[6:7] -; GFX90a-PRELOAD-8-NEXT: global_store_short v1, v0, s[10:11] -; GFX90a-PRELOAD-8-NEXT: s_endpgm - store i16 %in, ptr addrspace(1) %out - store <2 x i8> %in2, ptr addrspace(1) %out2 - ret void -} - -attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll index 6fdc0d5834ef6..0d88466fc31b3 100644 --- a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll @@ -513,8 +513,8 @@ define amdgpu_kernel void @alloca_promote_atomicrmw_private_lds_promote(ptr addr ; ; GCN-LABEL: 
alloca_promote_atomicrmw_private_lds_promote: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -555,8 +555,8 @@ define amdgpu_kernel void @alloca_promote_cmpxchg_private(ptr addrspace(1) %out, ; ; GCN-LABEL: alloca_promote_cmpxchg_private: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index b6afb7cf8c9a1..cf7efed46cef5 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -14,13 +14,13 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s3 +; GFX8-NEXT: s_add_u32 s36, s36, s9 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -95,13 +95,13 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: 
s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -165,14 +165,14 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s3 +; GFX10-NEXT: s_add_u32 s36, s36, s9 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[2:3] -; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[0:1] +; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -233,15 +233,15 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; ; GFX11-LABEL: clmem_read_simplified: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_getpc_b64 s[2:3] -; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: s_getpc_b64 
s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -346,13 +346,13 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s3 +; GFX8-NEXT: s_add_u32 s36, s36, s9 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -473,13 +473,13 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX900-NEXT: s_mov_b32 s38, -1 ; GFX900-NEXT: s_mov_b32 s39, 0xe00000 -; GFX900-NEXT: s_add_u32 s36, s36, s3 +; GFX900-NEXT: s_add_u32 s36, s36, s9 ; GFX900-NEXT: s_addc_u32 s37, s37, 0 -; GFX900-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX900-NEXT: s_getpc_b64 s[0:1] ; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX900-NEXT: s_addc_u32 s1, s1, 
_Z13get_global_idj@gotpcrel32@hi+12 ; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX900-NEXT: v_mov_b32_e32 v31, v0 ; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -589,14 +589,14 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s3 +; GFX10-NEXT: s_add_u32 s36, s36, s9 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[2:3] -; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[0:1] +; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -701,13 +701,13 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX90A-NEXT: s_mov_b32 s38, -1 ; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 -; GFX90A-NEXT: s_add_u32 s36, s36, s3 +; GFX90A-NEXT: s_add_u32 s36, s36, s9 ; GFX90A-NEXT: s_addc_u32 s37, s37, 0 -; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX90A-NEXT: s_getpc_b64 s[0:1] ; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] ; 
GFX90A-NEXT: v_mov_b32_e32 v31, v0 ; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -811,15 +811,15 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; ; GFX11-LABEL: clmem_read: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_getpc_b64 s[2:3] -; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 17, v0 ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0xff, v0 ; GFX11-NEXT: s_movk_i32 s1, 0x7f @@ -1033,13 +1033,13 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s3 +; GFX8-NEXT: s_add_u32 s36, s36, s9 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1119,13 +1119,13 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX9-NEXT: 
s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1176,14 +1176,14 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s3 +; GFX10-NEXT: s_add_u32 s36, s36, s9 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[2:3] -; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[0:1] +; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1238,15 +1238,15 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; ; GFX11-LABEL: Address32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_getpc_b64 s[2:3] -; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; 
GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -1348,13 +1348,13 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s3 +; GFX8-NEXT: s_add_u32 s36, s36, s9 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1401,13 +1401,13 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, 
_Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1450,14 +1450,14 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s3 +; GFX10-NEXT: s_add_u32 s36, s36, s9 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[2:3] -; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[0:1] +; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1496,15 +1496,15 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; ; GFX11-LABEL: Offset64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_getpc_b64 s[2:3] -; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[34:35], 
s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -1574,13 +1574,13 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s3 +; GFX8-NEXT: s_add_u32 s36, s36, s9 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1624,13 +1624,13 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1669,14 +1669,14 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: 
s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s3 +; GFX10-NEXT: s_add_u32 s36, s36, s9 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[2:3] -; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[0:1] +; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1709,15 +1709,15 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) { ; ; GFX11-LABEL: p32Offset64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_getpc_b64 s[2:3] -; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -1781,13 +1781,13 @@ define amdgpu_kernel void 
@DiffBase(ptr addrspace(1) %buffer1, ; GFX8-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s42, -1 ; GFX8-NEXT: s_mov_b32 s43, 0xe80000 -; GFX8-NEXT: s_add_u32 s40, s40, s3 +; GFX8-NEXT: s_add_u32 s40, s40, s9 ; GFX8-NEXT: s_addc_u32 s41, s41, 0 -; GFX8-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[42:43] @@ -1844,13 +1844,13 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s42, -1 ; GFX9-NEXT: s_mov_b32 s43, 0xe00000 -; GFX9-NEXT: s_add_u32 s40, s40, s3 +; GFX9-NEXT: s_add_u32 s40, s40, s9 ; GFX9-NEXT: s_addc_u32 s41, s41, 0 -; GFX9-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] @@ -1903,14 +1903,14 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; GFX10-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s42, -1 ; GFX10-NEXT: s_mov_b32 s43, 0x31c16000 -; GFX10-NEXT: s_add_u32 s40, s40, s3 +; GFX10-NEXT: s_add_u32 s40, s40, s9 ; GFX10-NEXT: s_addc_u32 s41, s41, 0 -; GFX10-NEXT: s_getpc_b64 s[2:3] -; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[0:1] +; GFX10-NEXT: s_add_u32 s0, 
s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX10-NEXT: s_mov_b64 s[2:3], s[42:43] @@ -1958,15 +1958,15 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; ; GFX11-LABEL: DiffBase: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_getpc_b64 s[2:3] -; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v12, 0xffff8000, v0 @@ -2051,13 +2051,13 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s3 +; GFX8-NEXT: s_add_u32 s36, s36, s9 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, 
_Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -2132,13 +2132,13 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -2201,14 +2201,14 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s3 +; GFX10-NEXT: s_add_u32 s36, s36, s9 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[2:3] -; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[0:1] +; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX10-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -2273,15 +2273,15 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; ; GFX11-LABEL: ReverseOrder: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_getpc_b64 s[2:3] -; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -2387,13 +2387,13 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s3 +; GFX8-NEXT: s_add_u32 s36, s36, s9 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -2429,13 +2429,13 @@ define hidden amdgpu_kernel void 
@negativeoffset(ptr addrspace(1) nocapture %buf ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -2470,14 +2470,14 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s3 +; GFX10-NEXT: s_add_u32 s36, s36, s9 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[2:3] -; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[0:1] +; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -2507,15 +2507,15 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; ; GFX11-LABEL: negativeoffset: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_getpc_b64 s[2:3] -; GFX11-NEXT: s_add_u32 s2, s2, 
_Z13get_global_idj@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) diff --git a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll index 5bb260c09c9dd..9a8d5acfbe3e9 100644 --- a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll +++ b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr addrspace(8) noalias %b) { ; SDAG-LABEL: buffers_dont_alias: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -18,7 +18,7 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr a ; ; GISEL-LABEL: buffers_dont_alias: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) @@ -50,7 +50,7 @@ define amdgpu_kernel void @buffers_dont_alias(ptr 
addrspace(8) noalias %a, ptr a define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr noalias %b.flat) { ; SDAG-LABEL: buffers_from_flat_dont_alias: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-NEXT: s_mov_b32 s7, 0 ; SDAG-NEXT: s_mov_b32 s6, 16 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -69,7 +69,7 @@ define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr ; ; GISEL-LABEL: buffers_from_flat_dont_alias: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-NEXT: s_mov_b32 s7, 0 ; GISEL-NEXT: s_mov_b32 s6, 16 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -110,7 +110,7 @@ define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspace(8) %b) { ; SDAG-LABEL: buffers_might_alias: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -132,7 +132,7 @@ define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspac ; ; GISEL-LABEL: buffers_might_alias: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) @@ -173,7 +173,7 @@ define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspac define amdgpu_kernel void @independent_offsets(ptr addrspace(8) %a) { ; SDAG-LABEL: independent_offsets: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SDAG-NEXT: v_mov_b32_e32 v2, 1.0 ; SDAG-NEXT: 
s_waitcnt lgkmcnt(0) @@ -186,7 +186,7 @@ define amdgpu_kernel void @independent_offsets(ptr addrspace(8) %a) { ; ; GISEL-LABEL: independent_offsets: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GISEL-NEXT: v_mov_b32_e32 v2, 1.0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll index 74bad5ea3edce..92465420a1ae7 100644 --- a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll +++ b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll @@ -748,21 +748,21 @@ define float @v_rcp_neg_fabs_f32_daz_ulp25(float %x) #0 { define amdgpu_kernel void @s_rcp_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_rcp_f32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e32 v2, s2 +; VI-NEXT: v_rcp_f32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -800,21 +800,21 @@ define amdgpu_kernel void @s_rcp_pat_f32_daz(ptr addrspace(1) %out, float %src) define amdgpu_kernel void @s_rcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_ulp25_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_rcp_f32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_ulp25_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e32 v2, s2 +; VI-NEXT: v_rcp_f32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -852,21 +852,21 @@ define amdgpu_kernel void @s_rcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, float define amdgpu_kernel void @s_rcp_fast_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_fast_ulp25_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_rcp_f32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_fast_ulp25_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e32 v2, s2 +; VI-NEXT: v_rcp_f32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -904,21 +904,21 @@ define amdgpu_kernel void 
@s_rcp_fast_ulp25_pat_f32_daz(ptr addrspace(1) %out, f define amdgpu_kernel void @s_rcp_arcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_arcp_ulp25_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_rcp_f32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_arcp_ulp25_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e32 v2, s2 +; VI-NEXT: v_rcp_f32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -956,21 +956,21 @@ define amdgpu_kernel void @s_rcp_arcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, f define amdgpu_kernel void @s_rcp_global_fast_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #2 { ; SI-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_rcp_f32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; 
VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e32 v2, s2 +; VI-NEXT: v_rcp_f32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1008,21 +1008,21 @@ define amdgpu_kernel void @s_rcp_global_fast_ulp25_pat_f32_daz(ptr addrspace(1) define amdgpu_kernel void @s_rcp_fabs_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_fabs_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e64 v0, |s2| ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_rcp_f32_e64 v0, |s4| ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_fabs_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e64 v2, |s2| +; VI-NEXT: v_rcp_f32_e64 v2, |s4| ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1061,21 +1061,21 @@ define amdgpu_kernel void @s_rcp_fabs_pat_f32_daz(ptr addrspace(1) %out, float % define amdgpu_kernel void @s_neg_rcp_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_neg_rcp_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e64 v0, -s2 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_rcp_f32_e64 v0, -s4 ; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_neg_rcp_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e64 v2, -s2 +; VI-NEXT: v_rcp_f32_e64 v2, -s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1116,21 +1116,21 @@ define amdgpu_kernel void @s_neg_rcp_pat_f32_daz(ptr addrspace(1) %out, float %s define amdgpu_kernel void @s_rcp_fabs_fneg_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_fabs_fneg_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e64 v0, -|s2| ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_rcp_f32_e64 v0, -|s4| ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_fabs_fneg_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e64 v2, -|s2| +; VI-NEXT: v_rcp_f32_e64 v2, -|s4| ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1173,8 +1173,8 @@ define amdgpu_kernel void @s_rcp_fabs_fneg_pat_f32_daz(ptr addrspace(1) %out, fl define amdgpu_kernel void @s_rcp_fabs_fneg_pat_multi_use_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 
+; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1188,13 +1188,13 @@ define amdgpu_kernel void @s_rcp_fabs_fneg_pat_multi_use_f32_daz(ptr addrspace(1 ; ; VI-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e64 v2, -|s2| +; VI-NEXT: v_rcp_f32_e64 v2, -|s4| ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mul_f32_e64 v3, s2, -|s2| +; VI-NEXT: v_mul_f32_e64 v3, s4, -|s4| ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v3 @@ -1244,7 +1244,7 @@ define amdgpu_kernel void @s_div_arcp_2_x_pat_f32_daz(ptr addrspace(1) %out) #0 ; SI-LABEL: s_div_arcp_2_x_pat_f32_daz: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s4, s[0:1], 0x0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1254,10 +1254,10 @@ define amdgpu_kernel void @s_div_arcp_2_x_pat_f32_daz(ptr addrspace(1) %out) #0 ; ; VI-LABEL: s_div_arcp_2_x_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_f32_e64 v2, s2, 0.5 +; VI-NEXT: v_mul_f32_e64 v2, s4, 0.5 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1298,7 +1298,7 @@ define amdgpu_kernel void @s_div_arcp_k_x_pat_f32_daz(ptr addrspace(1) %out) #0 ; SI-LABEL: s_div_arcp_k_x_pat_f32_daz: ; SI: ; %bb.0: ; SI-NEXT: 
s_load_dword s4, s[0:1], 0x0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0x3dcccccd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -1309,11 +1309,11 @@ define amdgpu_kernel void @s_div_arcp_k_x_pat_f32_daz(ptr addrspace(1) %out) #0 ; ; VI-LABEL: s_div_arcp_k_x_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x3dcccccd ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_f32_e32 v2, s2, v0 +; VI-NEXT: v_mul_f32_e32 v2, s4, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1355,7 +1355,7 @@ define amdgpu_kernel void @s_div_arcp_neg_k_x_pat_f32_daz(ptr addrspace(1) %out) ; SI-LABEL: s_div_arcp_neg_k_x_pat_f32_daz: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s4, s[0:1], 0x0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0xbdcccccd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -1366,11 +1366,11 @@ define amdgpu_kernel void @s_div_arcp_neg_k_x_pat_f32_daz(ptr addrspace(1) %out) ; ; VI-LABEL: s_div_arcp_neg_k_x_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xbdcccccd ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_f32_e32 v2, s2, v0 +; VI-NEXT: v_mul_f32_e32 v2, s4, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll index 24e420b7d657b..b1fa85f7c675b 100644 --- 
a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll +++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll @@ -3212,71 +3212,72 @@ define i64 @v_mul_934584645_add_8234599_i64(i64 %arg) { define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) %i21, ptr addrspace(1) nocapture noundef writeonly align 4 %arg, i32 noundef %arg1) #1 { ; GFX67-LABEL: compute_mad: ; GFX67: ; %bb.0: ; %bb -; GFX67-NEXT: s_load_dword s3, s[0:1], 0x6 -; GFX67-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX67-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4 +; GFX67-NEXT: s_load_dword s0, s[2:3], 0x6 +; GFX67-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 +; GFX67-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4 +; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_load_dword s6, s[6:7], 0x1 -; GFX67-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX67-NEXT: s_add_i32 s3, s3, 1 -; GFX67-NEXT: v_mul_lo_u32 v1, s3, v0 -; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_and_b32 s6, s6, 0xffff -; GFX67-NEXT: s_mul_i32 s2, s2, s6 -; GFX67-NEXT: v_add_i32_e32 v2, vcc, s3, v1 +; GFX67-NEXT: s_add_i32 s0, s0, 1 +; GFX67-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX67-NEXT: v_add_i32_e32 v2, vcc, s0, v1 ; GFX67-NEXT: v_mul_lo_u32 v2, v2, v0 ; GFX67-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; GFX67-NEXT: s_load_dword s2, s[10:11], 0x1 +; GFX67-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX67-NEXT: v_mul_lo_u32 v3, v2, v1 -; GFX67-NEXT: s_mov_b32 s3, 0xf000 -; GFX67-NEXT: s_mov_b32 s2, 0 +; GFX67-NEXT: s_waitcnt lgkmcnt(0) +; GFX67-NEXT: s_and_b32 s2, s2, 0xffff ; GFX67-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GFX67-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX67-NEXT: v_add_i32_e32 v2, vcc, 1, v3 +; GFX67-NEXT: s_mul_i32 s6, s6, s2 ; GFX67-NEXT: v_mul_lo_u32 v3, v1, v2 +; GFX67-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; GFX67-NEXT: s_mov_b32 s6, 0 ; GFX67-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX67-NEXT: v_mul_lo_u32 v1, v2, v1 -; GFX67-NEXT: 
v_mov_b32_e32 v2, s5 +; GFX67-NEXT: v_mov_b32_e32 v2, s1 ; GFX67-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX67-NEXT: v_add_i32_e32 v3, vcc, v3, v1 ; GFX67-NEXT: v_mul_lo_u32 v4, v3, v1 -; GFX67-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GFX67-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; GFX67-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX67-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX67-NEXT: v_add_i32_e32 v2, vcc, v4, v3 -; GFX67-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GFX67-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; GFX67-NEXT: s_endpgm ; ; GFX8-LABEL: compute_mad: ; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x18 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x10 +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x18 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX8-NEXT: s_add_i32 s3, s3, 1 -; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s3, v1 +; GFX8-NEXT: s_add_i32 s0, s0, 1 +; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v1 ; GFX8-NEXT: v_mul_lo_u32 v2, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 1, v1 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x4 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 ; GFX8-NEXT: v_mul_lo_u32 v3, v2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s1, s3, 0xffff +; GFX8-NEXT: s_load_dword s4, s[10:11], 0x4 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 ; GFX8-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v3 -; GFX8-NEXT: s_mul_i32 s2, s2, s1 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; GFX8-NEXT: v_mul_lo_u32 v3, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_and_b32 s1, s4, 0xffff +; GFX8-NEXT: s_mul_i32 s6, s6, s1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; 
GFX8-NEXT: v_mul_lo_u32 v1, v2, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v3 @@ -3287,102 +3288,104 @@ define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) % ; ; GFX900-LABEL: compute_mad: ; GFX900: ; %bb.0: ; %bb -; GFX900-NEXT: s_load_dword s3, s[0:1], 0x18 -; GFX900-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x10 +; GFX900-NEXT: s_load_dword s0, s[2:3], 0x18 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_add_i32 s3, s3, 1 -; GFX900-NEXT: v_mul_lo_u32 v1, s3, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, s9 -; GFX900-NEXT: v_add_u32_e32 v2, s3, v1 +; GFX900-NEXT: s_add_i32 s0, s0, 1 +; GFX900-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX900-NEXT: v_add_u32_e32 v2, s0, v1 ; GFX900-NEXT: v_mul_lo_u32 v2, v2, v0 ; GFX900-NEXT: v_add_u32_e32 v1, 1, v1 -; GFX900-NEXT: s_load_dword s3, s[6:7], 0x4 -; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_load_dword s4, s[10:11], 0x4 +; GFX900-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; GFX900-NEXT: v_mul_lo_u32 v3, v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, s1 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_and_b32 s3, s3, 0xffff -; GFX900-NEXT: s_mul_i32 s2, s2, s3 +; GFX900-NEXT: s_and_b32 s1, s4, 0xffff ; GFX900-NEXT: v_add_u32_e32 v1, v3, v1 ; GFX900-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX900-NEXT: v_add_u32_e32 v2, 1, v3 -; GFX900-NEXT: v_add_u32_e32 v0, s2, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, s1 +; 
GFX900-NEXT: s_mul_i32 s6, s6, s1 +; GFX900-NEXT: v_add_u32_e32 v0, s6, v0 ; GFX900-NEXT: v_mul_lo_u32 v3, v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, s3 ; GFX900-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX900-NEXT: v_mul_lo_u32 v1, v2, v1 -; GFX900-NEXT: v_mad_u64_u32 v[2:3], s[2:3], v1, v3, v[1:2] -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s0, v0 +; GFX900-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v3, v[1:2] +; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 ; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX900-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4] -; GFX900-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v1, v[2:3] -; GFX900-NEXT: v_add_co_u32_e32 v1, vcc, s8, v3 +; GFX900-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, v1, v[2:3] +; GFX900-NEXT: v_add_co_u32_e32 v1, vcc, s0, v3 ; GFX900-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v4, vcc ; GFX900-NEXT: global_store_dword v[1:2], v0, off ; GFX900-NEXT: s_endpgm ; ; GFX90A-LABEL: compute_mad: ; GFX90A: ; %bb.0: ; %bb -; GFX90A-NEXT: s_load_dword s3, s[0:1], 0x18 -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x10 +; GFX90A-NEXT: s_load_dword s4, s[2:3], 0x18 +; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 +; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_add_i32 s3, s3, 1 -; GFX90A-NEXT: v_mul_lo_u32 v1, s3, v0 -; GFX90A-NEXT: v_add_u32_e32 v2, s3, v1 -; GFX90A-NEXT: v_mul_lo_u32 v2, v2, v0 -; GFX90A-NEXT: v_add_u32_e32 v1, 1, v1 -; GFX90A-NEXT: v_mul_lo_u32 v3, v2, v1 -; GFX90A-NEXT: v_add_u32_e32 v1, v3, v1 -; GFX90A-NEXT: v_mul_lo_u32 v1, v1, v2 -; GFX90A-NEXT: v_add_u32_e32 v2, 1, v3 -; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v2 -; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX90A-NEXT: s_load_dword s3, s[6:7], 0x4 -; GFX90A-NEXT: v_mul_lo_u32 v2, v2, v1 -; GFX90A-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, v3, v[2:3] -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NEXT: s_add_i32 s4, 
s4, 1 +; GFX90A-NEXT: v_mul_lo_u32 v0, s4, v4 +; GFX90A-NEXT: v_add_u32_e32 v1, s4, v0 +; GFX90A-NEXT: v_mul_lo_u32 v1, v1, v4 +; GFX90A-NEXT: v_add_u32_e32 v0, 1, v0 +; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v0 +; GFX90A-NEXT: v_add_u32_e32 v0, v2, v0 +; GFX90A-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX90A-NEXT: v_add_u32_e32 v1, 1, v2 +; GFX90A-NEXT: v_mul_lo_u32 v2, v0, v1 +; GFX90A-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX90A-NEXT: s_load_dword s7, s[10:11], 0x4 +; GFX90A-NEXT: v_mul_lo_u32 v0, v1, v0 +; GFX90A-NEXT: v_mad_u64_u32 v[2:3], s[2:3], v0, v2, v[0:1] +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX90A-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, v[2:3] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_and_b32 s3, s3, 0xffff -; GFX90A-NEXT: s_mul_i32 s2, s2, s3 -; GFX90A-NEXT: v_add_u32_e32 v0, s2, v0 -; GFX90A-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v2, v[4:5] +; GFX90A-NEXT: s_and_b32 s4, s7, 0xffff +; GFX90A-NEXT: s_mul_i32 s6, s6, s4 +; GFX90A-NEXT: v_add_u32_e32 v1, s6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s2, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] ; GFX90A-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; GFX90A-NEXT: v_mov_b32_e32 v3, s9 -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s8, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc -; GFX90A-NEXT: global_store_dword v[0:1], v2, off +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX90A-NEXT: global_store_dword v[2:3], v0, off ; GFX90A-NEXT: s_endpgm ; ; GFX10-LABEL: compute_mad: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s3, s[0:1], 0x18 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x10 +; GFX10-NEXT: 
s_clause 0x1 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x18 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_i32 s3, s3, 1 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_mul_lo_u32 v1, s3, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v2, s3, v1 +; GFX10-NEXT: s_add_i32 s0, s0, 1 +; GFX10-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v2, s0, v1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4 +; GFX10-NEXT: s_load_dword s4, s[10:11], 0x4 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; GFX10-NEXT: v_mul_lo_u32 v2, v2, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, v2, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v3, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10-NEXT: s_and_b32 s4, s4, 0xffff ; GFX10-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v3 ; GFX10-NEXT: v_mul_lo_u32 v4, v2, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v1 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, s2, s3, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, s6, s4, v[0:1] ; GFX10-NEXT: v_mul_lo_u32 v1, v3, v2 -; GFX10-NEXT: v_add_co_u32 v2, s2, s4, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, null, s5, 0, s2 +; GFX10-NEXT: v_add_co_u32 v2, s2, s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s2 ; GFX10-NEXT: v_mad_u64_u32 v[4:5], null, v1, v4, v[1:2] ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v4, v1, v[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-extended-image-insts.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-extended-image-insts.ll index 0c67f00d7bebf..f57e86c68ebf9 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-extended-image-insts.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-extended-image-insts.ll @@ -35,8 +35,6 @@ define <4 x float> @needs_extimg(float noundef %0, float noundef %1, <8 x 
i32> n ; IR: define void @caller( define void @caller(float noundef %0, float noundef %1, <8 x i32> noundef %2, <4 x i32> noundef %3) { - ; EXTIMG: call void @needs_extimg( - ; NOEXTIMG: call void null call void @needs_extimg(float %0, float %1, <8 x i32> %2, <4 x i32> %3) ; IR: ret void ret void @@ -45,3 +43,6 @@ define void @caller(float noundef %0, float noundef %1, <8 x i32> noundef %2, <4 declare <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) attributes #0 = { "target-features"="+extended-image-insts" } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; EXTIMG: {{.*}} +; NOEXTIMG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll index a0380c82d9aaf..e0b694ee58f0e 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll @@ -115,11 +115,6 @@ @ConstantExpr = internal global i64 ptrtoint (ptr @needs_dpp to i64) define void @needs_dpp(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #0 { -; GFX7-NOT: define void @needs_dpp( -; GFX8: define void @needs_dpp( -; GFX9: define void @needs_dpp( -; GFX10: define void @needs_dpp( -; GFX11: define void @needs_dpp( entry: %cmp = icmp eq i64 %a, 0 br i1 %cmp, label %if, label %else @@ -139,11 +134,6 @@ endif: } define void @needs_16bit_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #1 { -; GFX7-NOT: define void @needs_16bit_insts( -; GFX8: define void @needs_16bit_insts( -; GFX9: define void @needs_16bit_insts( -; GFX10: define void @needs_16bit_insts( -; GFX11: define void @needs_16bit_insts( entry: %cmp = icmp eq i64 %a, 0 br i1 %cmp, label %if, label %else @@ -163,11 +153,6 @@ endif: } define void @needs_gfx8_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #2 { -; GFX7-NOT: define void @needs_gfx8_insts( -; GFX8: define 
void @needs_gfx8_insts( -; GFX9: define void @needs_gfx8_insts( -; GFX10: define void @needs_gfx8_insts( -; GFX11: define void @needs_gfx8_insts( entry: %cmp = icmp eq i64 %a, 0 br i1 %cmp, label %if, label %else @@ -187,11 +172,6 @@ endif: } define void @needs_gfx9_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #3 { -; GFX7-NOT: define void @needs_gfx9_insts( -; GFX8-NOT: define void @needs_gfx9_insts( -; GFX9: define void @needs_gfx9_insts( -; GFX10: define void @needs_gfx9_insts( -; GFX11: define void @needs_gfx9_insts( entry: %cmp = icmp eq i64 %a, 0 br i1 %cmp, label %if, label %else @@ -211,11 +191,6 @@ endif: } define void @needs_gfx10_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #4 { -; GFX7-NOT: define void @needs_gfx10_insts( -; GFX8-NOT: define void @needs_gfx10_insts( -; GFX9-NOT: define void @needs_gfx10_insts( -; GFX10: define void @needs_gfx10_insts( -; GFX11: define void @needs_gfx10_insts( entry: %cmp = icmp eq i64 %a, 0 br i1 %cmp, label %if, label %else @@ -235,11 +210,6 @@ endif: } define void @needs_gfx11_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #5 { -; GFX7-NOT: define void @needs_gfx11_insts( -; GFX8-NOT: define void @needs_gfx11_insts( -; GFX9-NOT: define void @needs_gfx11_insts( -; GFX10-NOT: define void @needs_gfx11_insts( -; GFX11: define void @needs_gfx11_insts( entry: %cmp = icmp eq i64 %a, 0 br i1 %cmp, label %if, label %else @@ -259,34 +229,18 @@ endif: } define void @needs_dot1_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #6 { -; GFX7-NOT: define void @needs_dot1_insts( -; GFX8-NOT: define void @needs_dot1_insts( -; GFX9: define void @needs_dot1_insts( -; GFX10: define void @needs_dot1_insts( -; GFX11-NOT: define void @needs_dot1_insts( %add = add i64 %a, %b store i64 %add, ptr %out ret void } define void @needs_dot2_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #7 { -; GFX7-NOT: define void @needs_dot2_insts( -; GFX8-NOT: define void @needs_dot2_insts( -; GFX9: define void @needs_dot2_insts( -; GFX10: define 
void @needs_dot2_insts( -; GFX11-NOT: define void @needs_dot2_insts( %add = add i64 %a, %b store i64 %add, ptr %out ret void } define void @needs_dot3_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #8 { -; GFX7-NOT: define void @needs_dot3_insts( -; GFX8-NOT: define void @needs_dot3_insts( -; GFX906-NOT: define void @needs_dot3_insts( -; GFX90A: define void @needs_dot3_insts( -; GFX10-NOT: define void @needs_dot3_insts( -; GFX11-NOT: define void @needs_dot3_insts( %add = add i64 %a, %b store i64 %add, ptr %out ret void @@ -294,58 +248,30 @@ define void @needs_dot3_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #8 { define void @needs_dot4_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #9 { -; GFX7-NOT: define void @needs_dot4_insts( -; GFX8-NOT: define void @needs_dot4_insts( -; GFX906-NOT: define void @needs_dot4_insts( -; GFX90A: define void @needs_dot4_insts( -; GFX10-NOT: define void @needs_dot4_insts( -; GFX11-NOT: define void @needs_dot4_insts( %add = add i64 %a, %b store i64 %add, ptr %out ret void } define void @needs_dot5_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #10 { -; GFX7-NOT: define void @needs_dot5_insts( -; GFX8-NOT: define void @needs_dot5_insts( -; GFX906-NOT: define void @needs_dot5_insts( -; GFX90A: define void @needs_dot5_insts( -; GFX10: define void @needs_dot5_insts( -; GFX11: define void @needs_dot5_insts( %add = add i64 %a, %b store i64 %add, ptr %out ret void } define void @needs_dot6_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #11 { -; GFX7-NOT: define void @needs_dot6_insts( -; GFX8-NOT: define void @needs_dot6_insts( -; GFX906-NOT: define void @needs_dot6_insts( -; GFX90A: define void @needs_dot6_insts( -; GFX10: define void @needs_dot6_insts( -; GFX11-NOT: define void @needs_dot6_insts( %add = add i64 %a, %b store i64 %add, ptr %out ret void } define void @needs_dot7_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #12 { -; GFX7-NOT: define void @needs_dot7_insts( -; GFX8-NOT: define void @needs_dot7_insts( -; GFX9: 
define void @needs_dot7_insts( -; GFX10: define void @needs_dot7_insts( -; GFX11: define void @needs_dot7_insts( %add = add i64 %a, %b store i64 %add, ptr %out ret void } define void @needs_dot8_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #13 { -; GFX7-NOT: define void @needs_dot8_insts( -; GFX8-NOT: define void @needs_dot8_insts( -; GFX9-NOT: define void @needs_dot8_insts( -; GFX10-NOT: define void @needs_dot8_insts( -; GFX11: define void @needs_dot8_insts( %add = add i64 %a, %b store i64 %add, ptr %out ret void @@ -353,95 +279,22 @@ define void @needs_dot8_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #13 { ; IR: define void @caller( define void @caller(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) { - ; GFX7: call void null( - ; GFX8: call void @needs_dpp( - ; GFX9: call void @needs_dpp( - ; GFX10: call void @needs_dpp( - ; GFX11: call void @needs_dpp( call void @needs_dpp(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) - ; GFX7: call void null( - ; GFX8: call void @needs_16bit_insts( - ; GFX9: call void @needs_16bit_insts( - ; GFX10: call void @needs_16bit_insts( - ; GFX11: call void @needs_16bit_insts( call void @needs_16bit_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) - ; GFX7: call void null( - ; GFX8: call void @needs_gfx8_insts( - ; GFX9: call void @needs_gfx8_insts( - ; GFX10: call void @needs_gfx8_insts( - ; GFX11: call void @needs_gfx8_insts( call void @needs_gfx8_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) - ; GFX7: call void null( - ; GFX8: call void null( - ; GFX9: call void @needs_gfx9_insts( - ; GFX10: call void @needs_gfx9_insts( ; GFX111: call void @needs_gfx9_insts(c call void @needs_gfx9_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) - ; GFX7: call void null( - ; GFX8: call void null( - ; GFX9: call void null( - ; GFX10: call void @needs_gfx10_insts( ; GFX111: call void @needs_gfx10_insts( call void @needs_gfx10_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) - ; GFX7: call void null( - ; GFX8: call void null( - ; GFX9: call void 
null( - ; GFX10: call void null( - ; GFX11: call void @needs_gfx11_insts( call void @needs_gfx11_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) - ; GFX7: call void null( - ; GFX8: call void null( - ; GFX9: call void @needs_dot1_insts( - ; GFX10: call void @needs_dot1_insts( - ; GFX11: call void null( call void @needs_dot1_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) - ; GFX7: call void null( - ; GFX8: call void null( - ; GFX9: call void @needs_dot2_insts( - ; GFX10: call void @needs_dot2_insts( - ; GFX11: call void null( call void @needs_dot2_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) - ; GFX7: call void null( - ; GFX8: call void null( - ; GFX906: call void null( - ; GFX90A: call void @needs_dot3_insts( - ; GFX10: call void null( - ; GFX11: call void null( call void @needs_dot3_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) - ; GFX7: call void null( - ; GFX8: call void null( - ; GFX906: call void null( - ; GFX90A: call void @needs_dot4_insts( - ; GFX10: call void null( - ; GFX11: call void null( call void @needs_dot4_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) - ; GFX7: call void null( - ; GFX8: call void null( - ; GFX906: call void null( - ; GFX90A: call void @needs_dot5_insts( - ; GFX10: call void @needs_dot5_insts( - ; GFX11: call void @needs_dot5_insts( call void @needs_dot5_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) - ; GFX7: call void null( - ; GFX8: call void null( - ; GFX906: call void null( - ; GFX90A: call void @needs_dot6_insts( - ; GFX10: call void @needs_dot6_insts( - ; GFX11: call void null( call void @needs_dot6_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) - ; GFX7: call void null( - ; GFX8: call void null( - ; GFX9: call void @needs_dot7_insts( - ; GFX10: call void @needs_dot7_insts( - ; GFX11: call void @needs_dot7_insts( call void @needs_dot7_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) - ; GFX7: call void null( - ; GFX8: call void null( - ; GFX9: call void null( - ; GFX10: call void null( - ; GFX11: call void 
@needs_dot8_insts( call void @needs_dot8_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) - ; IR: ret void ret void } @@ -459,3 +312,12 @@ attributes #10 = { "target-features"="+dot5-insts" } attributes #11 = { "target-features"="+dot6-insts" } attributes #12 = { "target-features"="+dot7-insts" } attributes #13 = { "target-features"="+dot8-insts" } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX10: {{.*}} +; GFX11: {{.*}} +; GFX7: {{.*}} +; GFX8: {{.*}} +; GFX9: {{.*}} +; GFX906: {{.*}} +; GFX90A: {{.*}} +; IR: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-gws.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-gws.ll index 594fad389b6b9..2b1e3999a8aa8 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-gws.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-gws.ll @@ -38,10 +38,7 @@ define void @needs_gws(i32 %val0, i32 %val1) #0 { ; IR: define void @gws_caller( define void @gws_caller(i32 %val0, i32 %val1) { - ; COMPATIBLE: call void @needs_gws( - ; INCOMPATIBLE: call void null call void @needs_gws(i32 %val0, i32 %val1) - ; IR: ret void ret void } @@ -52,3 +49,7 @@ declare void @llvm.amdgcn.ds.gws.init(i32, i32) #2 attributes #0 = { "target-features"="+gws"} attributes #1 = { convergent inaccessiblememonly nounwind } attributes #2 = { convergent inaccessiblememonly nounwind writeonly } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; COMPATIBLE: {{.*}} +; INCOMPATIBLE: {{.*}} +; IR: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll index 2c2401f120cf5..32fed3ba22c59 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll @@ -41,8 +41,6 @@ define i64 @needs_s_memrealtime() #0 { ; IR: define void @s_memrealtime_caller( define i64 @s_memrealtime_caller() { - ; REALTIME: call i64 @needs_s_memrealtime( - ; NOREALTIME: call i64 null %t = call i64 @needs_s_memrealtime() ; IR: ret i64 %t ret i64 %t @@ -57,8 +55,6 @@ define i64 @needs_s_memtime() #1 { ; IR: define void @s_memtime_caller( define i64 @s_memtime_caller() { - ; MEMTIME: call i64 @needs_s_memtime( - ; NOMEMTIME: call i64 null %t = call i64 @needs_s_memtime() ; IR: ret i64 %t ret i64 %t @@ -70,3 +66,10 @@ declare i64 @llvm.amdgcn.s.memtime() attributes #0 = { "target-features"="+s-memrealtime"} attributes #1 = { "target-features"="+s-memtime-inst"} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; COMPATIBLE: {{.*}} +; INCOMPATIBLE: {{.*}} +; MEMTIME: {{.*}} +; NOMEMTIME: {{.*}} +; NOREALTIME: {{.*}} +; REALTIME: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll index a87973d93ac77..fdce4431fbbf2 100644 --- a/llvm/test/CodeGen/AMDGPU/rotl.ll +++ b/llvm/test/CodeGen/AMDGPU/rotl.ll @@ -21,7 +21,7 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; SI-LABEL: rotl_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_sub_i32 s3, 32, s3 @@ -35,7 +35,7 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX8-LABEL: rotl_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_i32 s3, 32, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s3 @@ -47,7 +47,7 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: rotl_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sub_i32 s3, 32, s3 @@ -57,7 +57,7 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX11-LABEL: rotl_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s3, 32, s3 @@ -95,8 +95,8 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; SI-LABEL: rotl_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -111,8 +111,8 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; GFX8-LABEL: rotl_v2i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_i32 s2, 32, s6 ; GFX8-NEXT: s_sub_i32 s3, 32, s7 @@ -128,22 +128,22 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-LABEL: rotl_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s0, 32, s7 -; GFX10-NEXT: s_sub_i32 s1, 32, s6 -; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s0 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX10-NEXT: s_sub_i32 s2, 32, s7 +; GFX10-NEXT: s_sub_i32 s3, 32, s6 +; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s2 +; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s3 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotl_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s2, 32, s7 @@ -188,8 +188,8 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) 
%in, <4 x i32> %x, <4 x i ; ; SI-LABEL: rotl_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -210,8 +210,8 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; GFX8-LABEL: rotl_v4i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_i32 s3, 32, s9 ; GFX8-NEXT: s_sub_i32 s9, 32, s11 @@ -233,26 +233,26 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-LABEL: rotl_v4i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s0, 32, s8 -; GFX10-NEXT: s_sub_i32 s1, 32, s9 +; GFX10-NEXT: s_sub_i32 s2, 32, s8 +; GFX10-NEXT: s_sub_i32 s3, 32, s9 ; GFX10-NEXT: s_sub_i32 s8, 32, s11 ; GFX10-NEXT: s_sub_i32 s9, 32, s10 ; GFX10-NEXT: v_alignbit_b32 v3, s7, s7, s8 ; GFX10-NEXT: v_alignbit_b32 v2, s6, s6, s9 -; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s1 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s0 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s3 +; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s2 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotl_v4i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: 
s_load_b256 s[4:11], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s2, 32, s8 diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll index 058ee589bc4b0..0e1dd69d930ae 100644 --- a/llvm/test/CodeGen/AMDGPU/rotr.ll +++ b/llvm/test/CodeGen/AMDGPU/rotr.ll @@ -19,7 +19,7 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; SI-LABEL: rotr_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -32,7 +32,7 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX8-LABEL: rotr_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s3 ; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v0 @@ -43,7 +43,7 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: rotr_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3 @@ -52,7 +52,7 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX11-LABEL: rotr_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s2, s2, s3 @@ -84,8 +84,8 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; SI-LABEL: 
rotr_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -98,8 +98,8 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; GFX8-LABEL: rotr_v2i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 @@ -113,20 +113,20 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-LABEL: rotr_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s7 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s6 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotr_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s5, s5, s7 @@ -161,8 +161,8 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; SI-LABEL: rotr_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: 
s_load_dwordx8 s[4:11], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -179,8 +179,8 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; GFX8-LABEL: rotr_v4i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s11 ; GFX8-NEXT: v_mov_b32_e32 v1, s10 @@ -198,22 +198,22 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-LABEL: rotr_v4i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v3, s7, s7, s11 ; GFX10-NEXT: v_alignbit_b32 v2, s6, s6, s10 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s9 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s8 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotr_v4i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v3, s7, s7, s11 diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll index 
846fbdb33d668..40a8592dba6df 100644 --- a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll @@ -20,7 +20,7 @@ declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) nounwind readnone define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { ; GCN-DAZ-UNSAFE-LABEL: rsq_f32: ; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, -1 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s10, s6 @@ -38,7 +38,7 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( ; ; GCN-IEEE-UNSAFE-LABEL: rsq_f32: ; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, -1 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s10, s6 @@ -56,7 +56,7 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( ; ; GCN-DAZ-SAFE-LABEL: rsq_f32: ; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6 @@ -91,7 +91,7 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( ; ; SI-IEEE-SAFE-LABEL: rsq_f32: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 @@ -134,7 +134,7 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( ; ; CI-IEEE-SAFE-LABEL: rsq_f32: ; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dwordx4 
s[8:11], s[0:1], 0x9 +; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 @@ -198,39 +198,39 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( define amdgpu_kernel void @rsq_f32_sgpr(ptr addrspace(1) noalias %out, float %val) { ; GCN-DAZ-UNSAFE-LABEL: rsq_f32_sgpr: ; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_load_dword s2, s[0:1], 0xb -; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-DAZ-UNSAFE-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s3, 0xf000 -; GCN-DAZ-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, s2 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s2, -1 +; GCN-DAZ-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, s4 ; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-DAZ-UNSAFE-NEXT: s_endpgm ; ; GCN-IEEE-UNSAFE-LABEL: rsq_f32_sgpr: ; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_load_dword s2, s[0:1], 0xb -; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-IEEE-UNSAFE-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, s2 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s2, -1 +; GCN-IEEE-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, s4 ; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-IEEE-UNSAFE-NEXT: s_endpgm ; ; GCN-DAZ-SAFE-LABEL: rsq_f32_sgpr: ; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-DAZ-SAFE-NEXT: s_load_dword s0, s[2:3], 0xb ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GCN-DAZ-SAFE-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x9 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s3, 0xf000 ; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, s2, v1 -; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, s2 -; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, s0, v1 +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, s0 +; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0 +; GCN-DAZ-SAFE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s3, 0xf000 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, -1 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v1 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1 @@ -245,20 +245,21 @@ define amdgpu_kernel void @rsq_f32_sgpr(ptr addrspace(1) noalias %out, float %va ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-DAZ-SAFE-NEXT: s_endpgm ; ; SI-IEEE-SAFE-LABEL: rsq_f32_sgpr: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-IEEE-SAFE-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, s2, v1 -; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, s2 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, s0, v1 +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, s0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 @@ -288,15 
+289,15 @@ define amdgpu_kernel void @rsq_f32_sgpr(ptr addrspace(1) noalias %out, float %va ; ; CI-IEEE-SAFE-LABEL: rsq_f32_sgpr: ; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dword s2, s[0:1], 0xb -; CI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; CI-IEEE-SAFE-NEXT: s_load_dword s0, s[2:3], 0xb +; CI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, s2, v1 -; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, s2 -; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, s0, v1 +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, s0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 @@ -366,7 +367,7 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-UNSAFE-NEXT: s_endpgm ; GCN-DAZ-UNSAFE-LABEL: rsqrt_fmul: ; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, 0 ; GCN-DAZ-UNSAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -390,7 +391,7 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-IEEE-UNSAFE-LABEL: rsqrt_fmul: ; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, 0 ; GCN-IEEE-UNSAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -414,7 +415,7 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-DAZ-SAFE-LABEL: rsqrt_fmul: ; 
GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s3, 0xf000 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0 ; GCN-DAZ-SAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -465,7 +466,7 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-IEEE-SAFE-LABEL: rsqrt_fmul: ; GCN-IEEE-SAFE: ; %bb.0: -; GCN-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s2, 0 ; GCN-IEEE-SAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -532,7 +533,7 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { ; GCN-DAZ-UNSAFE-LABEL: neg_rsq_f32: ; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, -1 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s10, s6 @@ -551,7 +552,7 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp ; ; GCN-IEEE-UNSAFE-LABEL: neg_rsq_f32: ; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, -1 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s10, s6 @@ -570,7 +571,7 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp ; ; GCN-DAZ-SAFE-LABEL: neg_rsq_f32: ; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6 
@@ -605,7 +606,7 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp ; ; SI-IEEE-SAFE-LABEL: neg_rsq_f32: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 @@ -648,7 +649,7 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp ; ; CI-IEEE-SAFE-LABEL: neg_rsq_f32: ; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 @@ -713,7 +714,7 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { ; GCN-DAZ-UNSAFE-LABEL: neg_rsq_neg_f32: ; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, -1 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s10, s6 @@ -732,7 +733,7 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GCN-IEEE-UNSAFE-LABEL: neg_rsq_neg_f32: ; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, -1 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s10, s6 @@ -751,7 +752,7 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GCN-DAZ-SAFE-LABEL: neg_rsq_neg_f32: ; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x9 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6 @@ -786,7 +787,7 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad ; ; SI-IEEE-SAFE-LABEL: neg_rsq_neg_f32: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 @@ -829,7 +830,7 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad ; ; CI-IEEE-SAFE-LABEL: neg_rsq_neg_f32: ; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 diff --git a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll index 0b58b95050524..78ea3b3699f2a 100644 --- a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll +++ b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll @@ -10,7 +10,7 @@ ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[VAL]] ; SI: buffer_store_dword [[VRESULT]] ; SI: s_endpgm -define amdgpu_kernel void @s_addk_i32_k0(ptr addrspace(1) %out, i32 %b) { +define amdgpu_kernel void @s_addk_i32_k0(ptr addrspace(1) %out, i32 %b) #0 { %add = add i32 %b, 65 store i32 %add, ptr addrspace(1) %out ret void @@ -20,7 +20,7 @@ define amdgpu_kernel void @s_addk_i32_k0(ptr addrspace(1) %out, i32 %b) { ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41 ; SI: s_endpgm -define amdgpu_kernel void @s_addk_i32_k0_x2(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %a, i32 %b) { +define amdgpu_kernel void @s_addk_i32_k0_x2(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %a, i32 %b) #0 { %add0 = add i32 %a, 65 %add1 = add i32 %b, 65 store i32 %add0, ptr addrspace(1) 
%out0 @@ -31,7 +31,7 @@ define amdgpu_kernel void @s_addk_i32_k0_x2(ptr addrspace(1) %out0, ptr addrspac ; SI-LABEL: {{^}}s_addk_i32_k1: ; SI: s_addk_i32 {{s[0-9]+}}, 0x7fff{{$}} ; SI: s_endpgm -define amdgpu_kernel void @s_addk_i32_k1(ptr addrspace(1) %out, i32 %b) { +define amdgpu_kernel void @s_addk_i32_k1(ptr addrspace(1) %out, i32 %b) #0 { %add = add i32 %b, 32767 ; (1 << 15) - 1 store i32 %add, ptr addrspace(1) %out ret void @@ -40,7 +40,7 @@ define amdgpu_kernel void @s_addk_i32_k1(ptr addrspace(1) %out, i32 %b) { ; SI-LABEL: {{^}}s_addk_i32_k2: ; SI: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, 17 ; SI: s_endpgm -define amdgpu_kernel void @s_addk_i32_k2(ptr addrspace(1) %out, i32 %b) { +define amdgpu_kernel void @s_addk_i32_k2(ptr addrspace(1) %out, i32 %b) #0 { %add = add i32 %b, -17 store i32 %add, ptr addrspace(1) %out ret void @@ -49,7 +49,7 @@ define amdgpu_kernel void @s_addk_i32_k2(ptr addrspace(1) %out, i32 %b) { ; SI-LABEL: {{^}}s_addk_i32_k3: ; SI: s_addk_i32 {{s[0-9]+}}, 0xffbf{{$}} ; SI: s_endpgm -define amdgpu_kernel void @s_addk_i32_k3(ptr addrspace(1) %out, i32 %b) { +define amdgpu_kernel void @s_addk_i32_k3(ptr addrspace(1) %out, i32 %b) #0 { %add = add i32 %b, -65 store i32 %add, ptr addrspace(1) %out ret void @@ -60,7 +60,7 @@ define amdgpu_kernel void @s_addk_i32_k3(ptr addrspace(1) %out, i32 %b) { ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42 ; SI: s_endpgm ; Note: dummy argument here to prevent combining of descriptor loads for %out and %b -define amdgpu_kernel void @s_addk_v2i32_k0(ptr addrspace(1) %out, i32 %dummy, <2 x i32> %b) { +define amdgpu_kernel void @s_addk_v2i32_k0(ptr addrspace(1) %out, i32 %dummy, <2 x i32> %b) #0 { %add = add <2 x i32> %b, store <2 x i32> %add, ptr addrspace(1) %out ret void @@ -72,7 +72,7 @@ define amdgpu_kernel void @s_addk_v2i32_k0(ptr addrspace(1) %out, i32 %dummy, <2 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x43 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x44 ; SI: s_endpgm -define amdgpu_kernel void @s_addk_v4i32_k0(ptr 
addrspace(1) %out, <4 x i32> %b) { +define amdgpu_kernel void @s_addk_v4i32_k0(ptr addrspace(1) %out, <4 x i32> %b) #0 { %add = add <4 x i32> %b, store <4 x i32> %add, ptr addrspace(1) %out ret void @@ -88,7 +88,7 @@ define amdgpu_kernel void @s_addk_v4i32_k0(ptr addrspace(1) %out, <4 x i32> %b) ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x47 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x48 ; SI: s_endpgm -define amdgpu_kernel void @s_addk_v8i32_k0(ptr addrspace(1) %out, <8 x i32> %b) { +define amdgpu_kernel void @s_addk_v8i32_k0(ptr addrspace(1) %out, <8 x i32> %b) #0 { %add = add <8 x i32> %b, store <8 x i32> %add, ptr addrspace(1) %out ret void @@ -97,7 +97,7 @@ define amdgpu_kernel void @s_addk_v8i32_k0(ptr addrspace(1) %out, <8 x i32> %b) ; SI-LABEL: {{^}}no_s_addk_i32_k0: ; SI: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8000{{$}} ; SI: s_endpgm -define amdgpu_kernel void @no_s_addk_i32_k0(ptr addrspace(1) %out, i32 %b) { +define amdgpu_kernel void @no_s_addk_i32_k0(ptr addrspace(1) %out, i32 %b) #0 { %add = add i32 %b, 32768 ; 1 << 15 store i32 %add, ptr addrspace(1) %out ret void @@ -116,5 +116,5 @@ define amdgpu_kernel void @commute_s_addk_i32(ptr addrspace(1) %out, i32 %b) #0 declare i32 @llvm.amdgcn.groupstaticsize() #1 -attributes #0 = { nounwind } +attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll index 0492c5663e666..35a5210d1c790 100644 --- a/llvm/test/CodeGen/AMDGPU/sad.ll +++ b/llvm/test/CodeGen/AMDGPU/sad.ll @@ -4,8 +4,8 @@ define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 @@ -30,8 +30,8 @@ define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a) { ; GCN-LABEL: v_sad_u32_constant_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[4:5], 0x2 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN-NEXT: s_load_dword s2, s[6:7], 0x2 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x5a ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_sad_u32 v2, s2, v0, 20 @@ -55,8 +55,8 @@ define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 @@ -79,12 +79,12 @@ define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_sub_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] -; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_add_u32 s8, s8, s7 -; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] +; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: 
s_add_u32 s16, s16, s13 +; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_max_u32 s3, s0, s1 ; GCN-NEXT: s_min_u32 s0, s0, s1 @@ -93,7 +93,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_add_i32 s0, s0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dword v[0:1], v2 @@ -115,19 +115,19 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_add_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] -; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_add_u32 s8, s8, s7 -; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] +; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_add_u32 s16, s16, s13 +; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: v_mov_b32_e32 v3, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_sad_u32 v2, s0, v2, v3 -; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm @@ -147,19 +147,19 @@ define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_max_pat1: ; 
GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] -; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_add_u32 s8, s8, s7 -; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] +; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_add_u32 s16, s16, s13 +; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_max_u32 s3, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NEXT: v_sad_u32 v3, s0, v0, v1 -; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 @@ -182,19 +182,19 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_min_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] -; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_add_u32 s8, s8, s7 -; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] +; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_add_u32 s16, s16, s13 +; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_min_u32 s3, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NEXT: v_sad_u32 v3, s0, v0, v1 -; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen ; 
GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 @@ -218,19 +218,19 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_sub_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] -; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_add_u32 s8, s8, s7 -; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] +; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_add_u32 s16, s16, s13 +; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s3, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NEXT: v_sad_u32 v3, s0, v0, v1 -; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 @@ -251,12 +251,12 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_select_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] -; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_add_u32 s8, s8, s7 -; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] +; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_add_u32 s16, s16, s13 +; 
GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s3, s0, s1 ; GCN-NEXT: s_sub_i32 s6, s1, s0 @@ -266,7 +266,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_add_i32 s0, s0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dword v[0:1], v2 @@ -286,9 +286,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; GCN-LABEL: v_sad_u32_vector_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xc -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0xc +; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s15 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -322,9 +322,9 @@ define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32 define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; GCN-LABEL: v_sad_u32_vector_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xc -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0xc +; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s15 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -356,11 +356,11 @@ define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32 define amdgpu_kernel void 
@v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 %b, i16 %c) { ; GCN-LABEL: v_sad_u32_i16_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[4:5], 0x2 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dword s4, s[6:7], 0x2 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s4, s6, 0xffff +; GCN-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s0 @@ -387,7 +387,7 @@ define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) { ; GCN: ; %bb.0: ; GCN-NEXT: flat_load_ushort v0, v[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GCN-NEXT: flat_load_ushort v1, v[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_load_ushort v2, v[0:1] glc @@ -415,8 +415,8 @@ define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) { define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b, i8 %c) { ; GCN-LABEL: v_sad_u32_i8_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[4:5], 0x2 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN-NEXT: s_load_dword s2, s[6:7], 0x2 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s2, 0xff ; GCN-NEXT: s_bfe_u32 s4, s2, 0x80008 @@ -446,7 +446,7 @@ define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) { ; GCN: ; %bb.0: ; GCN-NEXT: flat_load_ubyte v0, v[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GCN-NEXT: flat_load_ubyte v1, v[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_load_ubyte v2, v[0:1] glc @@ -474,8 +474,8 @@ define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) { 
define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) { ; GCN-LABEL: s_sad_u32_i8_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[4:5], 0x2 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN-NEXT: s_load_dword s2, s[6:7], 0x2 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s4, s2, 8 ; GCN-NEXT: s_and_b32 s3, s2, 0xff @@ -505,8 +505,8 @@ define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext % define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) { ; GCN-LABEL: v_sad_u32_mismatched_operands_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_max_u32 s6, s0, s1 ; GCN-NEXT: s_cmp_le_u32 s0, s1 @@ -534,8 +534,8 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) % define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) { ; GCN-LABEL: v_sad_u32_mismatched_operands_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s3, s0, s3 ; GCN-NEXT: s_sub_i32 s6, s1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll index bd3c422b52efc..684279a3776fc 100644 --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -15,8 +15,8 @@ declare { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32>, <2 x i define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { ; 
SI-LABEL: saddo_i64_zext: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -38,8 +38,8 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; ; VI-LABEL: saddo_i64_zext: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: s_add_u32 s2, s6, s0 @@ -59,20 +59,20 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; ; GFX9-LABEL: saddo_i64_zext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_add_u32 s0, s6, s2 +; GFX9-NEXT: s_add_u32 s2, s6, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s1, s7, s3 -; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], 0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: s_addc_u32 s3, s7, s1 +; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_xor_b64 s[0:1], s[8:9], vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: 
global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm @@ -80,26 +80,26 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; GFX10-LABEL: saddo_i64_zext: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s6, s2 -; GFX10-NEXT: s_addc_u32 s1, s7, s3 -; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[6:7] -; GFX10-NEXT: s_xor_b32 s2, s2, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: s_add_u32 s2, s6, s0 +; GFX10-NEXT: s_addc_u32 s3, s7, s1 +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], s[6:7] +; GFX10-NEXT: s_xor_b32 s0, s0, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v0, s0, s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: saddo_i64_zext: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s2, s6, s0 @@ -128,34 +128,34 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_saddo_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], 
s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s2, s10 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_add_i32 s12, s8, s9 -; SI-NEXT: s_cmp_lt_i32 s9, 0 -; SI-NEXT: s_cselect_b64 s[10:11], -1, 0 -; SI-NEXT: s_cmp_lt_i32 s12, s8 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_cselect_b64 s[8:9], -1, 0 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_xor_b64 s[0:1], s[10:11], s[8:9] -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_add_i32 s14, s12, s13 +; SI-NEXT: s_cmp_lt_i32 s13, 0 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: s_cmp_lt_i32 s14, s12 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 +; SI-NEXT: v_mov_b32_e32 v0, s14 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_saddo_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_add_i32 s4, s0, s1 @@ -175,15 +175,15 @@ define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_saddo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_add_i32 s0, s2, s3 -; GFX9-NEXT: v_add_i32 v1, s2, v1 clamp -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s0, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_add_i32 s1, s0, s1 +; GFX9-NEXT: v_add_i32 v1, s0, v1 clamp +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: global_store_dword v0, v2, s[4:5] ; GFX9-NEXT: global_store_byte v0, v1, s[6:7] @@ -192,12 +192,12 @@ define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX10-LABEL: s_saddo_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_nc_i32 v0, s2, s3 clamp -; GFX10-NEXT: s_add_i32 s0, s2, s3 +; GFX10-NEXT: v_add_nc_i32 v0, s0, s1 clamp +; GFX10-NEXT: s_add_i32 s0, s0, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s0, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -208,8 +208,8 @@ define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-LABEL: s_saddo_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_nc_i32 v0, s4, s5 clamp ; GFX11-NEXT: s_add_i32 s4, s4, s5 @@ -234,7 +234,7 @@ define 
amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_saddo_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -264,7 +264,7 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_saddo_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -288,7 +288,7 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_saddo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] @@ -304,7 +304,7 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: v_saddo_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -321,7 +321,7 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_saddo_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -352,7 +352,7 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr 
addrspace(1) %carryout, i64 %a, i64 %b) nounwind { ; SI-LABEL: s_saddo_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -379,7 +379,7 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: s_saddo_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s4, s6 @@ -401,7 +401,7 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_saddo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s8, s4, s6 @@ -420,7 +420,7 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: s_saddo_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s8, s4, s6 @@ -437,7 +437,7 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: s_saddo_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s8, s4, s6 ; GFX11-NEXT: s_addc_u32 s9, s5, s7 @@ -465,7 +465,7 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_saddo_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], 
s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -496,7 +496,7 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_saddo_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -521,7 +521,7 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_saddo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9] @@ -539,7 +539,7 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: v_saddo_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -558,7 +558,7 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_saddo_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -592,7 +592,7 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_saddo_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; 
SI-NEXT: s_mov_b32 s14, s10 @@ -627,7 +627,7 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_saddo_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -656,7 +656,7 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_saddo_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[4:5] @@ -676,7 +676,7 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: v_saddo_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -697,7 +697,7 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_saddo_v2i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v5, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll index 5260a4847f70d..1700ce302cc9d 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: scalar_to_vector_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 
-1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -26,7 +26,7 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: scalar_to_vector_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -53,7 +53,7 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: scalar_to_vector_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -73,7 +73,7 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: scalar_to_vector_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -219,8 +219,8 @@ bb: define amdgpu_kernel void @scalar_to_vector_test6(ptr addrspace(1) %out, i8 zeroext %val) nounwind { ; SI-LABEL: scalar_to_vector_test6: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -230,8 +230,8 @@ define amdgpu_kernel void @scalar_to_vector_test6(ptr addrspace(1) %out, i8 zero ; ; VI-LABEL: scalar_to_vector_test6: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll index baee88b69d060..89a09dc4fcc17 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; GFX900-LABEL: scalar_to_vector_v8i16: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v5, s3 @@ -22,7 +22,7 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; ; GFX906-LABEL: scalar_to_vector_v8i16: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v5, s3 @@ -37,7 +37,7 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; ; GFX908-LABEL: scalar_to_vector_v8i16: ; GFX908: ; %bb.0: ; %entry -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, s3 @@ -52,8 +52,9 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; ; GFX90A-LABEL: scalar_to_vector_v8i16: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, s3 ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -85,7 +86,7 @@ entry: define 
amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 { ; GFX900-LABEL: scalar_to_vector_v8f16: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v5, s3 @@ -100,7 +101,7 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; ; GFX906-LABEL: scalar_to_vector_v8f16: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v5, s3 @@ -115,7 +116,7 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; ; GFX908-LABEL: scalar_to_vector_v8f16: ; GFX908: ; %bb.0: ; %entry -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, s3 @@ -130,8 +131,9 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; ; GFX90A-LABEL: scalar_to_vector_v8f16: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, s3 ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll index 5f291489848fe..ad82869c001f6 100644 --- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll @@ -23,7 +23,7 @@ define 
amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ; def s[2:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_writelane_b32 v23, s2, 0 -; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 +; CHECK-NEXT: s_load_dword s0, s[6:7], 0x8 ; CHECK-NEXT: v_writelane_b32 v23, s3, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[4:7] diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll index 6372d74161fad..b57a51f1382ae 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -16,7 +16,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s10, s2 @@ -60,7 +60,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; TONGA-LABEL: sdiv_i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 ; TONGA-NEXT: s_mov_b32 s10, s2 @@ -104,7 +104,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX9-LABEL: sdiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s10, s2 @@ -199,7 +199,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @sdiv_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -220,7 +220,7 @@ define 
amdgpu_kernel void @sdiv_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: sdiv_i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; TONGA-NEXT: s_mov_b32 s7, 0xf000 ; TONGA-NEXT: s_mov_b32 s6, -1 ; TONGA-NEXT: s_mov_b32 s10, s6 @@ -241,7 +241,7 @@ define amdgpu_kernel void @sdiv_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: sdiv_i32_4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -293,7 +293,7 @@ define amdgpu_kernel void @sdiv_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: slow_sdiv_i32_3435: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -316,7 +316,7 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa ; ; TONGA-LABEL: slow_sdiv_i32_3435: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; TONGA-NEXT: s_mov_b32 s7, 0xf000 ; TONGA-NEXT: s_mov_b32 s6, -1 ; TONGA-NEXT: s_mov_b32 s10, s6 @@ -339,7 +339,7 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: slow_sdiv_i32_3435: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -391,7 +391,7 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: 
sdiv_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -462,7 +462,7 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: sdiv_v2i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; TONGA-NEXT: s_mov_b32 s7, 0xf000 ; TONGA-NEXT: s_mov_b32 s6, -1 ; TONGA-NEXT: s_mov_b32 s10, s6 @@ -533,7 +533,7 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: sdiv_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s10, s2 @@ -682,7 +682,7 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_v2i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -707,7 +707,7 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: sdiv_v2i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; TONGA-NEXT: s_mov_b32 s7, 0xf000 ; TONGA-NEXT: s_mov_b32 s6, -1 ; TONGA-NEXT: s_mov_b32 s10, s6 @@ -732,7 +732,7 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: sdiv_v2i32_4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: 
s_mov_b32 s10, s6 @@ -791,7 +791,7 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: s_mov_b32 s6, s10 @@ -918,7 +918,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: sdiv_v4i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; TONGA-NEXT: s_mov_b32 s11, 0xf000 ; TONGA-NEXT: s_mov_b32 s10, -1 ; TONGA-NEXT: s_mov_b32 s6, s10 @@ -1045,7 +1045,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: sdiv_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s10, s2 @@ -1305,7 +1305,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_v4i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -1338,7 +1338,7 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: sdiv_v4i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 ; TONGA-NEXT: s_mov_b32 s10, s2 @@ -1371,7 +1371,7 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr 
addrspace(1) ; ; GFX9-LABEL: sdiv_v4i32_4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -1449,7 +1449,7 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: v_sdiv_i8: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -1482,7 +1482,7 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; TONGA-LABEL: v_sdiv_i8: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; TONGA-NEXT: s_mov_b32 s7, 0xf000 ; TONGA-NEXT: s_mov_b32 s6, -1 ; TONGA-NEXT: s_mov_b32 s10, s6 @@ -1515,7 +1515,7 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: v_sdiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -1594,7 +1594,7 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: v_sdiv_i23: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -1637,7 +1637,7 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: v_sdiv_i23: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; TONGA-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 ; TONGA-NEXT: s_mov_b32 s10, s2 @@ -1680,7 +1680,7 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: v_sdiv_i23: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s10, s2 @@ -1783,7 +1783,7 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: v_sdiv_i24: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -1824,7 +1824,7 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: v_sdiv_i24: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 ; TONGA-NEXT: s_mov_b32 s10, s2 @@ -1865,7 +1865,7 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: v_sdiv_i24: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s10, s2 @@ -1962,7 +1962,7 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: v_sdiv_i25: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 
s10, s2 @@ -2009,7 +2009,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: v_sdiv_i25: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 ; TONGA-NEXT: s_mov_b32 s10, s2 @@ -2056,7 +2056,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: v_sdiv_i25: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s10, s2 @@ -2189,7 +2189,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) { ; GCN-LABEL: scalarize_mulhs_4xi32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -2221,7 +2221,7 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read ; ; TONGA-LABEL: scalarize_mulhs_4xi32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; TONGA-NEXT: s_mov_b32 s7, 0xf000 ; TONGA-NEXT: s_mov_b32 s6, -1 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) @@ -2253,7 +2253,7 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read ; ; GFX9-LABEL: scalarize_mulhs_4xi32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 
c310e257adadc..f4776747f16ac 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -5,20 +5,20 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s8, s3, 31 -; GCN-NEXT: s_add_u32 s2, s2, s8 +; GCN-NEXT: s_ashr_i32 s8, s1, 31 +; GCN-NEXT: s_add_u32 s0, s0, s8 ; GCN-NEXT: s_mov_b32 s9, s8 -; GCN-NEXT: s_addc_u32 s3, s3, s8 -; GCN-NEXT: s_xor_b64 s[10:11], s[2:3], s[8:9] +; GCN-NEXT: s_addc_u32 s1, s1, s8 +; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GCN-NEXT: s_sub_u32 s4, 0, s10 ; GCN-NEXT: s_subb_u32 s5, 0, s11 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -140,8 +140,8 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; ; GCN-IR-LABEL: s_test_sdiv: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; GCN-IR-NEXT: s_mov_b32 s15, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i32 s0, s7, 31 @@ -460,8 +460,8 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_sdiv24_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv24_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s1, s[2:3], 0xe ; 
GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -490,8 +490,8 @@ define amdgpu_kernel void @s_test_sdiv24_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv24_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -587,8 +587,8 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv32_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s8, s[0:1], 0xe -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s8, s[2:3], 0xe +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -628,8 +628,8 @@ define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv32_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xe -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s8, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -676,14 +676,14 @@ define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv31_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s3, s[0:1], 0xe +; GCN-NEXT: s_load_dword s1, s[2:3], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[8:9], s[2:3], 33 +; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 33 ; GCN-NEXT: s_abs_i32 s9, s8 
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -720,14 +720,14 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv31_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[2:3], 33 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 33 ; GCN-IR-NEXT: s_abs_i32 s9, s8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s9 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -771,8 +771,8 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv23_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s1, s[2:3], 0xe ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -801,8 +801,8 @@ define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv23_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -838,14 +838,14 @@ define amdgpu_kernel void @s_test_sdiv23_64(ptr 
addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv25_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s3, s[0:1], 0xe +; GCN-NEXT: s_load_dword s1, s[2:3], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[8:9], s[2:3], 39 +; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 39 ; GCN-NEXT: s_abs_i32 s9, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -882,14 +882,14 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv25_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[2:3], 39 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 39 ; GCN-IR-NEXT: s_abs_i32 s9, s8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s9 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -933,94 +933,94 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_sdiv24_v2i64(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) { ; GCN-LABEL: s_test_sdiv24_v2i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0xd +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 
-1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_ashr_i64 s[2:3], s[12:13], 40 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 -; GCN-NEXT: s_xor_b32 s4, s4, s8 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s8 +; GCN-NEXT: s_ashr_i64 s[0:1], s[10:11], 40 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 -; GCN-NEXT: s_ashr_i32 s4, s4, 30 -; GCN-NEXT: s_ashr_i64 s[10:11], s[10:11], 40 +; GCN-NEXT: s_xor_b32 s1, s8, s2 +; GCN-NEXT: s_ashr_i32 s1, s1, 30 +; GCN-NEXT: s_ashr_i64 s[10:11], s[14:15], 40 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: s_or_b32 s7, s4, 1 -; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| -; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN-NEXT: s_cselect_b32 s4, s7, 0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v2 +; GCN-NEXT: s_or_b32 s1, s1, 1 +; GCN-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN-NEXT: s_cselect_b32 s1, s1, 0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s1, v2 ; GCN-NEXT: v_cvt_f32_i32_e32 v2, s10 -; GCN-NEXT: v_cvt_f32_i32_e32 v3, s6 -; GCN-NEXT: s_xor_b32 s4, s6, s10 -; GCN-NEXT: s_ashr_i32 s4, s4, 30 +; GCN-NEXT: v_cvt_f32_i32_e32 v3, s0 +; GCN-NEXT: s_xor_b32 s0, s0, s10 +; GCN-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GCN-NEXT: s_or_b32 s6, s4, 1 +; GCN-NEXT: s_or_b32 s2, s0, 1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 ; GCN-NEXT: v_trunc_f32_e32 v4, v4 ; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 ; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v2| -; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN-NEXT: s_cselect_b32 s4, s6, 0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s4, v4 +; 
GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-NEXT: s_cselect_b32 s0, s2, 0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s0, v4 ; GCN-NEXT: v_bfe_i32 v2, v2, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_sdiv24_v2i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0xd +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[12:13], 40 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s4 -; GCN-IR-NEXT: s_xor_b32 s4, s4, s8 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s8 +; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[10:11], 40 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 -; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 -; GCN-IR-NEXT: s_ashr_i64 s[10:11], s[10:11], 40 +; GCN-IR-NEXT: s_xor_b32 s1, s8, s2 +; GCN-IR-NEXT: s_ashr_i32 s1, s1, 30 +; GCN-IR-NEXT: s_ashr_i64 s[10:11], s[14:15], 40 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: s_or_b32 s7, s4, 1 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| -; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN-IR-NEXT: s_cselect_b32 s4, s7, 0 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s4, v2 +; GCN-IR-NEXT: s_or_b32 s1, s1, 1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| +; GCN-IR-NEXT: 
s_and_b64 s[2:3], s[2:3], exec +; GCN-IR-NEXT: s_cselect_b32 s1, s1, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s1, v2 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, s10 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, s6 -; GCN-IR-NEXT: s_xor_b32 s4, s6, s10 -; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, s0 +; GCN-IR-NEXT: s_xor_b32 s0, s0, s10 +; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GCN-IR-NEXT: s_or_b32 s6, s4, 1 +; GCN-IR-NEXT: s_or_b32 s2, s0, 1 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v4, v3, v4 ; GCN-IR-NEXT: v_trunc_f32_e32 v4, v4 ; GCN-IR-NEXT: v_mad_f32 v3, -v4, v2, v3 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v2| -; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN-IR-NEXT: s_cselect_b32 s4, s6, 0 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s4, v4 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| +; GCN-IR-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-IR-NEXT: s_cselect_b32 s0, s2, 0 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s0, v4 ; GCN-IR-NEXT: v_bfe_i32 v2, v2, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr <2 x i64> %x, %2 = ashr <2 x i64> %y, @@ -1032,20 +1032,18 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(ptr addrspace(1) %out, <2 x i64> define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 %y) { ; GCN-LABEL: s_test_sdiv24_48: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: 
s_mov_b32 s1, s5 -; GCN-NEXT: s_sext_i32_i16 s5, s9 -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_alignbit_b32 v0, s5, v0, 24 +; GCN-NEXT: s_sext_i32_i16 s2, s7 +; GCN-NEXT: s_sext_i32_i16 s1, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_alignbit_b32 v0, s1, v0, 24 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_sext_i32_i16 s4, s7 ; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_alignbit_b32 v2, s4, v2, 24 +; GCN-NEXT: v_alignbit_b32 v2, s2, v2, 24 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v2 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; GCN-NEXT: v_xor_b32_e32 v0, v2, v0 @@ -1057,33 +1055,35 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_mov_b32 s8, s4 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: s_mov_b32 s9, s5 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 +; GCN-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GCN-NEXT: buffer_store_short v1, off, s[8:11], 0 offset:4 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_sdiv24_48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb ; GCN-IR-NEXT: s_mov_b32 s15, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sext_i32_i16 s5, s5 -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[4:5], 24 +; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[4:5], 24 ; GCN-IR-NEXT: s_sext_i32_i16 s7, s7 -; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 +; GCN-IR-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 24 -; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 16 -; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[0:1], 16 +; GCN-IR-NEXT: s_ashr_i32 s0, s1, 31 ; GCN-IR-NEXT: 
s_lshl_b64 s[4:5], s[4:5], 16 -; GCN-IR-NEXT: s_mov_b32 s3, s2 +; GCN-IR-NEXT: s_mov_b32 s1, s0 ; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 16 ; GCN-IR-NEXT: s_ashr_i32 s4, s5, 31 -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[0:1] ; GCN-IR-NEXT: s_mov_b32 s5, s4 -; GCN-IR-NEXT: s_sub_u32 s12, s6, s2 -; GCN-IR-NEXT: s_subb_u32 s13, s7, s2 +; GCN-IR-NEXT: s_sub_u32 s12, s6, s0 +; GCN-IR-NEXT: s_subb_u32 s13, s7, s0 ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5] ; GCN-IR-NEXT: s_sub_u32 s6, s6, s4 ; GCN-IR-NEXT: s_subb_u32 s7, s7, s4 @@ -1146,8 +1146,8 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[10:11], 1 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[6:7] ; GCN-IR-NEXT: .LBB9_5: ; %udiv-end -; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 -; GCN-IR-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3] +; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 +; GCN-IR-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] ; GCN-IR-NEXT: s_xor_b64 s[2:3], s[10:11], s[0:1] ; GCN-IR-NEXT: s_sub_u32 s0, s2, s0 ; GCN-IR-NEXT: s_subb_u32 s1, s3, s1 @@ -1170,7 +1170,7 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_sdiv_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1285,7 +1285,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; ; GCN-IR-LABEL: s_test_sdiv_k_num_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i32 s4, s3, 31 @@ -1853,7 +1853,7 @@ define i64 
@v_test_sdiv_pow2_k_den_i64(i64 %x) { define amdgpu_kernel void @s_test_sdiv24_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_sdiv24_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1880,7 +1880,7 @@ define amdgpu_kernel void @s_test_sdiv24_k_num_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_sdiv24_k_num_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -1913,7 +1913,7 @@ define amdgpu_kernel void @s_test_sdiv24_k_num_i64(ptr addrspace(1) %out, i64 %x define amdgpu_kernel void @s_test_sdiv24_k_den_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_sdiv24_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s8, 0x46b6fe00 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -1939,7 +1939,7 @@ define amdgpu_kernel void @s_test_sdiv24_k_den_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_sdiv24_k_den_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: s_mov_b32 s8, 0x46b6fe00 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index 911bb44078d51..669ed915a002a 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; NOSDWA-LABEL: add_shr_i32: ; NOSDWA: ; %bb.0: -; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; 
NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s2 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 @@ -22,7 +22,7 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX89-LABEL: add_shr_i32: ; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s2 ; GFX89-NEXT: v_mov_b32_e32 v1, s3 @@ -36,7 +36,7 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: add_shr_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -47,7 +47,7 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: add_shr_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -65,7 +65,7 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; NOSDWA-LABEL: sub_shr_i32: ; NOSDWA: ; %bb.0: -; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s2 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 @@ -80,7 +80,7 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX89-LABEL: sub_shr_i32: ; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, 
s2 ; GFX89-NEXT: v_mov_b32_e32 v1, s3 @@ -94,7 +94,7 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: sub_shr_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -105,7 +105,7 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: sub_shr_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -123,8 +123,8 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @mul_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; NOSDWA-LABEL: mul_shr_i32: ; NOSDWA: ; %bb.0: -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -147,8 +147,8 @@ define amdgpu_kernel void @mul_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX89-LABEL: mul_shr_i32: ; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -168,12 +168,12 @@ define amdgpu_kernel void @mul_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: mul_shr_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -183,13 +183,13 @@ define amdgpu_kernel void @mul_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX10-LABEL: mul_shr_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -210,8 +210,8 @@ define amdgpu_kernel void @mul_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @mul_i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_i16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -231,8 +231,8 @@ define amdgpu_kernel void 
@mul_i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ; ; GFX89-LABEL: mul_i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -252,12 +252,12 @@ define amdgpu_kernel void @mul_i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ; ; GFX9-LABEL: mul_i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v1, v1, v2 @@ -267,13 +267,13 @@ define amdgpu_kernel void @mul_i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ; GFX10-LABEL: mul_i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u16 v1, v1, v2 @@ -293,8 +293,8 @@ entry: define amdgpu_kernel void @mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: 
mul_v2i16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -319,8 +319,8 @@ define amdgpu_kernel void @mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX89-LABEL: mul_v2i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -342,12 +342,12 @@ define amdgpu_kernel void @mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: mul_v2i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 @@ -357,13 +357,13 @@ define amdgpu_kernel void @mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; GFX10-LABEL: mul_v2i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 
0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_lo_u16 v1, v1, v2 @@ -383,8 +383,8 @@ entry: define amdgpu_kernel void @mul_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v4i16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -415,8 +415,8 @@ define amdgpu_kernel void @mul_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX89-LABEL: mul_v4i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -441,12 +441,12 @@ define amdgpu_kernel void @mul_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: mul_v4i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v3 @@ -457,13 +457,13 @@ define amdgpu_kernel void @mul_v4i16(ptr 
addrspace(1) %out, ptr addrspace(1) %in ; GFX10-LABEL: mul_v4i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_lo_u16 v1, v1, v3 @@ -484,8 +484,8 @@ entry: define amdgpu_kernel void @mul_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v8i16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -528,8 +528,8 @@ define amdgpu_kernel void @mul_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX89-LABEL: mul_v8i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -560,12 +560,12 @@ define amdgpu_kernel void @mul_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: mul_v8i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; 
GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_lo_u16 v3, v3, v7 @@ -578,13 +578,13 @@ define amdgpu_kernel void @mul_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; GFX10-LABEL: mul_v8i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_lo_u16 v3, v3, v7 @@ -607,8 +607,8 @@ entry: define amdgpu_kernel void @mul_half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_half: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -625,8 +625,8 @@ define amdgpu_kernel void @mul_half(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX89-LABEL: mul_half: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -643,12 +643,12 @@ define amdgpu_kernel void @mul_half(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX9-LABEL: mul_half: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] @@ -657,13 +657,13 @@ define amdgpu_kernel void @mul_half(ptr addrspace(1) %out, ptr addrspace(1) %ina ; GFX10-LABEL: mul_half: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-NEXT: global_store_short v0, v1, s[4:5] @@ -679,8 +679,8 @@ entry: define amdgpu_kernel void @mul_v2half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v2half: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; 
NOSDWA-NEXT: v_mov_b32_e32 v0, s6 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 @@ -703,8 +703,8 @@ define amdgpu_kernel void @mul_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX89-LABEL: mul_v2half: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -723,12 +723,12 @@ define amdgpu_kernel void @mul_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: mul_v2half: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -737,13 +737,13 @@ define amdgpu_kernel void @mul_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-LABEL: mul_v2half: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -759,8 +759,8 @@ entry: define 
amdgpu_kernel void @mul_v4half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v4half: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -789,8 +789,8 @@ define amdgpu_kernel void @mul_v4half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX89-LABEL: mul_v4half: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -812,12 +812,12 @@ define amdgpu_kernel void @mul_v4half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: mul_v4half: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 @@ -827,13 +827,13 @@ define amdgpu_kernel void @mul_v4half(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-LABEL: mul_v4half: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 @@ -850,8 +850,8 @@ entry: define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v8half: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v4, s6 ; NOSDWA-NEXT: v_mov_b32_e32 v5, s7 @@ -892,8 +892,8 @@ define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX89-LABEL: mul_v8half: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -921,12 +921,12 @@ define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: mul_v8half: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: 
v_pk_mul_f16 v3, v3, v7 ; GFX9-NEXT: v_pk_mul_f16 v2, v2, v6 @@ -938,13 +938,13 @@ define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-LABEL: mul_v8half: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_f16 v3, v3, v7 ; GFX10-NEXT: v_pk_mul_f16 v2, v2, v6 @@ -963,8 +963,8 @@ entry: define amdgpu_kernel void @mul_i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_i8: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v2, s7 ; NOSDWA-NEXT: v_add_u32_e32 v1, vcc, s6, v0 @@ -983,8 +983,8 @@ define amdgpu_kernel void @mul_i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ; ; GFX89-LABEL: mul_i8: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v2, s7 ; GFX89-NEXT: v_add_u32_e32 v1, vcc, s6, v0 @@ -1003,11 +1003,11 @@ define amdgpu_kernel void @mul_i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ; ; GFX9-LABEL: mul_i8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7] -; GFX9-NEXT: global_load_ubyte v2, v0, s[2:3] +; GFX9-NEXT: global_load_ubyte v2, v0, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v1, v1, v2 @@ -1017,12 +1017,12 @@ define amdgpu_kernel void @mul_i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ; GFX10-LABEL: mul_i8: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] -; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v2, v0, s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u16 v1, v1, v2 @@ -1042,8 +1042,8 @@ entry: define amdgpu_kernel void @mul_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v2i8: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -1070,8 +1070,8 @@ define amdgpu_kernel void @mul_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX89-LABEL: mul_v2i8: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -1093,12 +1093,12 @@ define amdgpu_kernel void @mul_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX9-LABEL: mul_v2i8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v3, v1, v2 @@ -1110,13 +1110,13 @@ define amdgpu_kernel void @mul_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; GFX10-LABEL: mul_v2i8: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_lshrrev_b16 v0, 8, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1142,8 +1142,8 @@ entry: define amdgpu_kernel void @mul_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v4i8: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; 
NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -1182,8 +1182,8 @@ define amdgpu_kernel void @mul_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX89-LABEL: mul_v4i8: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -1209,12 +1209,12 @@ define amdgpu_kernel void @mul_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX9-LABEL: mul_v4i8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v3, v1, v2 @@ -1230,13 +1230,13 @@ define amdgpu_kernel void @mul_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; GFX10-LABEL: mul_v4i8: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v1 ; GFX10-NEXT: v_lshrrev_b16 v3, 8, v1 @@ -1271,8 +1271,8 @@ entry: 
define amdgpu_kernel void @mul_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v8i8: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -1330,8 +1330,8 @@ define amdgpu_kernel void @mul_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX89-LABEL: mul_v8i8: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -1364,12 +1364,12 @@ define amdgpu_kernel void @mul_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX9-LABEL: mul_v8i8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v5, v1, v3 @@ -1392,13 +1392,13 @@ define amdgpu_kernel void @mul_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; GFX10-LABEL: mul_v8i8: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; 
GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX10-NEXT: v_lshrrev_b16 v6, 8, v0 @@ -1449,7 +1449,7 @@ entry: define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; NOSDWA-LABEL: sitofp_v2i16_to_v2f16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s2 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 @@ -1467,7 +1467,7 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; ; GFX89-LABEL: sitofp_v2i16_to_v2f16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s2 ; GFX89-NEXT: v_mov_b32_e32 v1, s3 @@ -1483,7 +1483,7 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; ; GFX9-LABEL: sitofp_v2i16_to_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1496,7 +1496,7 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; ; GFX10-LABEL: sitofp_v2i16_to_v2f16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -1519,8 +1519,8 @@ entry: define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: 
mac_v2half: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 @@ -1543,8 +1543,8 @@ define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX89-LABEL: mac_v2half: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 ; GFX89-NEXT: v_mov_b32_e32 v2, s0 @@ -1566,12 +1566,12 @@ define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: mac_v2half: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v2 ; GFX9-NEXT: v_pk_add_f16 v1, v1, v2 @@ -1581,13 +1581,13 @@ define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-LABEL: mac_v2half: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: 
global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v2 ; GFX10-NEXT: v_pk_add_f16 v1, v1, v2 @@ -1605,7 +1605,7 @@ entry: define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; NOSDWA-LABEL: immediate_mul_v2i16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 @@ -1625,7 +1625,7 @@ define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX89-LABEL: immediate_mul_v2i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX89-NEXT: v_mov_b32_e32 v3, 0x141 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) @@ -1644,7 +1644,7 @@ define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: immediate_mul_v2i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1657,7 +1657,7 @@ define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-LABEL: immediate_mul_v2i16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1679,8 +1679,8 @@ entry: define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mulmul_v2i16: ; NOSDWA: ; %bb.0: ; %entry -; 
NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -1708,8 +1708,8 @@ define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX89-LABEL: mulmul_v2i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -1733,12 +1733,12 @@ define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: mulmul_v2i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 @@ -1749,13 +1749,13 @@ define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX10-LABEL: mulmul_v2i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: 
global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_lo_u16 v0, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1777,8 +1777,8 @@ entry: define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: add_bb_v2i16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -1802,8 +1802,8 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX89-LABEL: add_bb_v2i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -1822,12 +1822,12 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: add_bb_v2i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -1836,13 +1836,13 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr 
addrspace(1) ; GFX10-LABEL: add_bb_v2i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -1863,7 +1863,7 @@ store_label: define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrspace(1) %destValues) #0 { ; NOSDWA-LABEL: pulled_out_test: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 @@ -1900,7 +1900,7 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp ; ; GFX89-LABEL: pulled_out_test: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX89-NEXT: v_mov_b32_e32 v4, 8 ; GFX89-NEXT: v_mov_b32_e32 v5, 0xff ; GFX89-NEXT: s_waitcnt lgkmcnt(0) @@ -1929,7 +1929,7 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp ; ; GFX9-LABEL: pulled_out_test: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1955,7 +1955,7 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp ; ; GFX10-LABEL: pulled_out_test: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 8 ; GFX10-NEXT: v_mov_b32_e32 v4, 24 @@ -2198,8 +2198,8 @@ bb2: define amdgpu_kernel void @mac_v2half_same_srcop(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mac_v2half_same_srcop: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 @@ -2222,8 +2222,8 @@ define amdgpu_kernel void @mac_v2half_same_srcop(ptr addrspace(1) %out, ptr addr ; ; GFX89-LABEL: mac_v2half_same_srcop: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -2245,11 +2245,11 @@ define amdgpu_kernel void @mac_v2half_same_srcop(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: mac_v2half_same_srcop: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v1 @@ -2261,12 +2261,12 @@ define amdgpu_kernel void @mac_v2half_same_srcop(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: mac_v2half_same_srcop: ; GFX10: ; %bb.0: ; 
%entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[0:1] ; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v1 diff --git a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll index d807c3909e656..f11e86aef683d 100644 --- a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll @@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.sffbh.i32(i32) nounwind readnone speculatable define amdgpu_kernel void @select_constant_cttz(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GCN-LABEL: select_constant_cttz: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index 0992e9e300f13..cc109595d8d70 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -6,35 +6,35 @@ define amdgpu_kernel void @select_f16( ; SI-LABEL: select_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 -; SI-NEXT: s_mov_b32 s18, s2 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 +; SI-NEXT: s_mov_b32 s18, 
s14 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s16, s6 ; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: s_mov_b32 s19, s3 +; SI-NEXT: s_mov_b32 s19, s15 ; SI-NEXT: s_mov_b32 s20, s8 ; SI-NEXT: s_mov_b32 s21, s9 -; SI-NEXT: s_mov_b32 s22, s2 -; SI-NEXT: s_mov_b32 s23, s3 +; SI-NEXT: s_mov_b32 s22, s14 +; SI-NEXT: s_mov_b32 s23, s15 ; SI-NEXT: s_mov_b32 s8, s10 ; SI-NEXT: s_mov_b32 s9, s11 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s14 +; SI-NEXT: s_mov_b32 s11, s15 +; SI-NEXT: s_mov_b32 s2, s14 +; SI-NEXT: s_mov_b32 s3, s15 ; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 glc +; SI-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s12, s4 +; SI-NEXT: s_mov_b32 s13, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -42,50 +42,50 @@ define amdgpu_kernel void @select_f16( ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[12:15], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: select_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 -; VI-NEXT: s_mov_b32 s18, s2 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s15, 0xf000 +; VI-NEXT: s_mov_b32 s14, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; VI-NEXT: s_mov_b32 s18, s14 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s16, s6 ; VI-NEXT: s_mov_b32 s17, s7 -; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: s_mov_b32 s19, s15 ; VI-NEXT: s_mov_b32 s20, s8 ; VI-NEXT: s_mov_b32 s21, s9 -; VI-NEXT: s_mov_b32 s22, s2 -; VI-NEXT: s_mov_b32 s23, s3 +; VI-NEXT: s_mov_b32 s22, s14 +; VI-NEXT: s_mov_b32 s23, s15 ; VI-NEXT: s_mov_b32 s8, s10 ; VI-NEXT: s_mov_b32 s9, s11 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s14 +; VI-NEXT: s_mov_b32 s11, s15 +; VI-NEXT: s_mov_b32 s2, s14 +; VI-NEXT: s_mov_b32 s3, s15 ; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 glc +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s12, s4 +; VI-NEXT: s_mov_b32 s13, s5 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[12:15], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: select_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 ; GFX11-NEXT: s_mov_b32 s14, -1 ; GFX11-NEXT: s_mov_b32 s15, 0x31016000 ; GFX11-NEXT: s_mov_b32 s18, s14 @@ -139,7 +139,7 @@ entry: define amdgpu_kernel void @select_f16_imm_a( ; SI-LABEL: select_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: 
s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -174,7 +174,7 @@ define amdgpu_kernel void @select_f16_imm_a( ; ; VI-LABEL: select_f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -205,7 +205,7 @@ define amdgpu_kernel void @select_f16_imm_a( ; ; GFX11-LABEL: select_f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -252,7 +252,7 @@ entry: define amdgpu_kernel void @select_f16_imm_b( ; SI-LABEL: select_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -287,7 +287,7 @@ define amdgpu_kernel void @select_f16_imm_b( ; ; VI-LABEL: select_f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -318,7 +318,7 @@ define amdgpu_kernel void @select_f16_imm_b( ; ; GFX11-LABEL: select_f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -365,7 +365,7 @@ entry: define amdgpu_kernel void @select_f16_imm_c( ; SI-LABEL: select_f16_imm_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -400,7 +400,7 @@ define amdgpu_kernel void 
@select_f16_imm_c( ; ; VI-LABEL: select_f16_imm_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -432,7 +432,7 @@ define amdgpu_kernel void @select_f16_imm_c( ; ; GFX11-LABEL: select_f16_imm_c: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -479,7 +479,7 @@ entry: define amdgpu_kernel void @select_f16_imm_d( ; SI-LABEL: select_f16_imm_d: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -514,7 +514,7 @@ define amdgpu_kernel void @select_f16_imm_d( ; ; VI-LABEL: select_f16_imm_d: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -546,7 +546,7 @@ define amdgpu_kernel void @select_f16_imm_d( ; ; GFX11-LABEL: select_f16_imm_d: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -593,31 +593,31 @@ entry: define amdgpu_kernel void @select_v2f16( ; SI-LABEL: select_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s18, s2 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: 
s_mov_b32 s14, -1 +; SI-NEXT: s_mov_b32 s18, s14 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s16, s6 ; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: s_mov_b32 s19, s3 +; SI-NEXT: s_mov_b32 s19, s15 ; SI-NEXT: s_mov_b32 s20, s8 ; SI-NEXT: s_mov_b32 s21, s9 -; SI-NEXT: s_mov_b32 s22, s2 -; SI-NEXT: s_mov_b32 s23, s3 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s22, s14 +; SI-NEXT: s_mov_b32 s23, s15 +; SI-NEXT: s_mov_b32 s2, s14 +; SI-NEXT: s_mov_b32 s3, s15 ; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: s_mov_b32 s8, s10 ; SI-NEXT: s_mov_b32 s9, s11 -; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s10, s14 +; SI-NEXT: s_mov_b32 s11, s15 ; SI-NEXT: buffer_load_dword v2, off, s[20:23], 0 ; SI-NEXT: buffer_load_dword v3, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s12, s4 +; SI-NEXT: s_mov_b32 s13, s5 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 @@ -642,36 +642,36 @@ define amdgpu_kernel void @select_v2f16( ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: select_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; VI-NEXT: s_mov_b32 s15, 0xf000 +; VI-NEXT: s_mov_b32 s14, -1 +; VI-NEXT: s_mov_b32 s2, s14 +; VI-NEXT: s_mov_b32 s3, s15 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 
s16, s6 ; VI-NEXT: s_mov_b32 s17, s7 -; VI-NEXT: s_mov_b32 s18, s2 -; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: s_mov_b32 s18, s14 +; VI-NEXT: s_mov_b32 s19, s15 ; VI-NEXT: s_mov_b32 s20, s8 ; VI-NEXT: s_mov_b32 s21, s9 -; VI-NEXT: s_mov_b32 s22, s2 -; VI-NEXT: s_mov_b32 s23, s3 +; VI-NEXT: s_mov_b32 s22, s14 +; VI-NEXT: s_mov_b32 s23, s15 ; VI-NEXT: s_mov_b32 s8, s10 ; VI-NEXT: s_mov_b32 s9, s11 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s10, s14 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v1, off, s[20:23], 0 ; VI-NEXT: buffer_load_dword v2, off, s[16:19], 0 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s11, s15 ; VI-NEXT: buffer_load_dword v3, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s12, s4 +; VI-NEXT: s_mov_b32 s13, s5 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -686,14 +686,14 @@ define amdgpu_kernel void @select_v2f16( ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: select_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[12:13], s[0:1], 0x44 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[12:13], s[2:3], 0x44 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s2 @@ -754,7 +754,7 @@ entry: define amdgpu_kernel void @select_v2f16_imm_a( ; SI-LABEL: select_v2f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 
; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -801,7 +801,7 @@ define amdgpu_kernel void @select_v2f16_imm_a( ; ; VI-LABEL: select_v2f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -839,7 +839,7 @@ define amdgpu_kernel void @select_v2f16_imm_a( ; ; GFX11-LABEL: select_v2f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -895,7 +895,7 @@ entry: define amdgpu_kernel void @select_v2f16_imm_b( ; SI-LABEL: select_v2f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -942,7 +942,7 @@ define amdgpu_kernel void @select_v2f16_imm_b( ; ; VI-LABEL: select_v2f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -980,7 +980,7 @@ define amdgpu_kernel void @select_v2f16_imm_b( ; ; GFX11-LABEL: select_v2f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -1036,7 +1036,7 @@ entry: define amdgpu_kernel void @select_v2f16_imm_c( ; SI-LABEL: select_v2f16_imm_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1083,7 
+1083,7 @@ define amdgpu_kernel void @select_v2f16_imm_c( ; ; VI-LABEL: select_v2f16_imm_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s18, s10 @@ -1123,7 +1123,7 @@ define amdgpu_kernel void @select_v2f16_imm_c( ; ; GFX11-LABEL: select_v2f16_imm_c: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s18, s10 @@ -1179,7 +1179,7 @@ entry: define amdgpu_kernel void @select_v2f16_imm_d( ; SI-LABEL: select_v2f16_imm_d: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1226,7 +1226,7 @@ define amdgpu_kernel void @select_v2f16_imm_d( ; ; VI-LABEL: select_v2f16_imm_d: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s18, s10 @@ -1266,7 +1266,7 @@ define amdgpu_kernel void @select_v2f16_imm_d( ; ; GFX11-LABEL: select_v2f16_imm_d: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s18, s10 diff --git a/llvm/test/CodeGen/AMDGPU/setcc.ll b/llvm/test/CodeGen/AMDGPU/setcc.ll index c00cd763992d9..cc82f532fc477 100644 --- a/llvm/test/CodeGen/AMDGPU/setcc.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc.ll @@ -463,4 +463,4 @@ entry: ret void } -attributes #0 = { nounwind } +attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll index 2169ee117cbaa..31a802b7428b9 100644 --- a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @sext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a, i32 %b) { ; GCN-LABEL: sext_i16_to_i32_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -25,9 +25,9 @@ define amdgpu_kernel void @sext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a define amdgpu_kernel void @sext_i16_to_i64_uniform(ptr addrspace(1) %out, i16 %a, i64 %b) { ; GCN-LABEL: sext_i16_to_i64_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0xd -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0xd +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -47,8 +47,8 @@ define amdgpu_kernel void @sext_i16_to_i64_uniform(ptr addrspace(1) %out, i16 %a define amdgpu_kernel void @sext_i16_to_i32_divergent(ptr addrspace(1) %out, i16 %a, i32 %b) { ; GCN-LABEL: sext_i16_to_i32_divergent: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ 
-68,8 +68,8 @@ define amdgpu_kernel void @sext_i16_to_i32_divergent(ptr addrspace(1) %out, i16 define amdgpu_kernel void @sext_i16_to_i64_divergent(ptr addrspace(1) %out, i16 %a, i64 %b) { ; GCN-LABEL: sext_i16_to_i64_divergent: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -89,9 +89,9 @@ define amdgpu_kernel void @sext_i16_to_i64_divergent(ptr addrspace(1) %out, i16 define amdgpu_kernel void @sext_i32_to_i64_uniform(ptr addrspace(1) %out, i32 %a, i64 %b) { ; GCN-LABEL: sext_i32_to_i64_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s6, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -111,8 +111,8 @@ define amdgpu_kernel void @sext_i32_to_i64_uniform(ptr addrspace(1) %out, i32 %a define amdgpu_kernel void @sext_i32_to_i64_divergent(ptr addrspace(1) %out, i32 %a, i64 %b) { ; GCN-LABEL: sext_i32_to_i64_divergent: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll index b67ecc2f9d13c..0630cca7c099b 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -11,18 +11,18 @@ define amdgpu_kernel void @sgpr_if_else_salu_br(ptr addrspace(1) 
%out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { ; SI-LABEL: sgpr_if_else_salu_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dword s2, s[0:1], 0xf +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dword s0, s[2:3], 0xf ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: s_cbranch_scc0 .LBB0_4 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_add_i32 s7, s7, s2 +; SI-NEXT: s_add_i32 s7, s7, s0 ; SI-NEXT: s_cbranch_execnz .LBB0_3 ; SI-NEXT: .LBB0_2: ; %if ; SI-NEXT: s_sub_i32 s7, s5, s6 ; SI-NEXT: .LBB0_3: ; %endif -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_add_i32 s4, s7, s4 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -56,23 +56,23 @@ endif: define amdgpu_kernel void @sgpr_if_else_salu_br_opt(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b, [8 x i32], i32 %c, [8 x i32], i32 %d, [8 x i32], i32 %e) { ; SI-LABEL: sgpr_if_else_salu_br_opt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0x13 +; SI-NEXT: s_load_dword s4, s[2:3], 0x13 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: s_cbranch_scc0 .LBB1_4 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_load_dword s2, s[0:1], 0x2e -; SI-NEXT: s_load_dword s3, s[0:1], 0x37 +; SI-NEXT: s_load_dword s0, s[2:3], 0x2e +; SI-NEXT: s_load_dword s1, s[2:3], 0x37 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_i32 s5, s2, s3 +; SI-NEXT: s_add_i32 s5, s0, s1 ; SI-NEXT: s_cbranch_execnz .LBB1_3 ; SI-NEXT: .LBB1_2: ; %if -; SI-NEXT: s_load_dword s2, s[0:1], 0x1c -; SI-NEXT: s_load_dword s3, s[0:1], 0x25 +; SI-NEXT: s_load_dword s0, s[2:3], 0x1c +; SI-NEXT: s_load_dword s1, s[2:3], 0x25 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_i32 s5, s2, s3 +; SI-NEXT: s_add_i32 s5, s0, s1 ; SI-NEXT: .LBB1_3: ; %endif -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_add_i32 s4, s5, s4 
; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -108,28 +108,28 @@ endif: define amdgpu_kernel void @sgpr_if_else_valu_br(ptr addrspace(1) %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) { ; SI-LABEL: sgpr_if_else_valu_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xc +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xc ; SI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc -; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc +; SI-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; SI-NEXT: s_cbranch_execz .LBB2_2 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_i32 s8, s6, s7 ; SI-NEXT: .LBB2_2: ; %Flow -; SI-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; SI-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: s_xor_b64 exec, exec, s[2:3] +; SI-NEXT: s_xor_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execz .LBB2_4 ; SI-NEXT: ; %bb.3: ; %if ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_i32 s4, s4, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: .LBB2_4: ; %endif -; SI-NEXT: s_or_b64 exec, exec, s[2:3] -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_or_b64 exec, exec, s[0:1] +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -158,8 +158,8 @@ endif: define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SI-LABEL: sgpr_if_else_valu_cmp_phi_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 diff --git 
a/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll index 8abd4b4302f54..3d8807a88a46c 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll @@ -4,7 +4,7 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:3 target triple = "amdgcn-amd-amdhsa" ; CHECK-LABEL: {{^}}t0: -; CHECK: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], s[4:5], 0x0 +; CHECK: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], s[6:7], 0x0 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, s[[PTR_HI]] ; There should be no redundant copies from PTR_HI. ; CHECK-NOT: v_mov_b32_e32 v{{[0-9]+}}, s[[PTR_HI]] diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll index 21fcd3cd0dcd6..37cf76103aa94 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll @@ -164,4 +164,4 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 declare i32 @llvm.amdgcn.workgroup.id.x() #0 attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } +attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll index bdc607552a0df..6de015c6de79b 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll @@ -391,8 +391,7 @@ define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(ptr add } declare i32 @llvm.amdgcn.workitem.id.x() #0 - declare i32 @llvm.amdgcn.workgroup.id.x() #0 attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } +attributes #1 = { nounwind "amdgpu-no-dispatch-id" 
"amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll index 4b02d00ddce1e..ebc916b5c889b 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -181,7 +181,7 @@ define i128 @v_ashr_i128_kv(i128 %rhs) { define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_shl_i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -214,7 +214,7 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) { define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_lshr_i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -247,7 +247,7 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) { define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_ashr_i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -430,7 +430,7 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_shl_v2i128ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx16 s[0:15], s[6:7], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v7, 0 @@ -502,7 
+502,7 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_lshr_v2i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx16 s[0:15], s[6:7], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v7, 0 @@ -574,7 +574,7 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_ashr_v2i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx16 s[0:15], s[6:7], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v7, 0 diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll index b3f4790df4d48..47ab5ba666877 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -10,7 +10,7 @@ declare i32 @llvm.amdgcn.workgroup.id.x() #0 define amdgpu_kernel void @shl_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -29,7 +29,7 @@ define amdgpu_kernel void @shl_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: shl_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -70,7 +70,7 @@ define amdgpu_kernel void @shl_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @shl_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -92,7 +92,7 @@ define amdgpu_kernel void @shl_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: shl_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; VI-NEXT: s_mov_b32 s11, 0xf000 @@ -140,7 +140,7 @@ define amdgpu_kernel void @shl_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @shl_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -159,7 +159,7 @@ define amdgpu_kernel void @shl_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: shl_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -214,40 +214,40 @@ define amdgpu_kernel void @shl_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @shl_i16_v_s(ptr addrspace(1) %out, ptr addrspace(1) %in, i16 %b) { ; SI-LABEL: shl_i16_v_s: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s12, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s12, s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s2, s10 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 
0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, s12, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_i16_v_s: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s12, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s12, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s11, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v0, s12, v0 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: shl_i16_v_s: @@ -287,42 +287,42 @@ define amdgpu_kernel void @shl_i16_v_s(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @shl_i16_v_compute_s(ptr addrspace(1) %out, ptr addrspace(1) %in, i16 %b) { ; SI-LABEL: shl_i16_v_compute_s: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s12, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: 
s_load_dword s12, s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s2, s10 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, s12, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_i16_v_compute_s: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s12, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s12, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s11, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; VI-NEXT: s_add_i32 s12, s12, 3 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v0, s12, v0 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: shl_i16_v_compute_s: @@ -370,7 +370,7 @@ define amdgpu_kernel void @shl_i16_v_compute_s(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel 
void @shl_i16_computed_amount(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_i16_computed_amount: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -396,7 +396,7 @@ define amdgpu_kernel void @shl_i16_computed_amount(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: shl_i16_computed_amount: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -472,8 +472,8 @@ define amdgpu_kernel void @shl_i16_computed_amount(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @shl_i16_i_s(ptr addrspace(1) %out, i16 zeroext %a) { ; SI-LABEL: shl_i16_i_s: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -484,8 +484,8 @@ define amdgpu_kernel void @shl_i16_i_s(ptr addrspace(1) %out, i16 zeroext %a) { ; ; VI-LABEL: shl_i16_i_s: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -530,7 +530,7 @@ define amdgpu_kernel void @shl_i16_i_s(ptr addrspace(1) %out, i16 zeroext %a) { define amdgpu_kernel void @shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -561,7 +561,7 @@ define 
amdgpu_kernel void @shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: shl_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -630,7 +630,7 @@ define amdgpu_kernel void @shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_v4i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 @@ -659,7 +659,7 @@ define amdgpu_kernel void @shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: shl_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -752,7 +752,7 @@ define amdgpu_kernel void @shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @shl_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -770,7 +770,7 @@ define amdgpu_kernel void @shl_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: shl_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -819,7 +819,7 @@ define amdgpu_kernel void @shl_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @shl_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; 
SI-LABEL: shl_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -839,7 +839,7 @@ define amdgpu_kernel void @shl_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: shl_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; VI-NEXT: s_mov_b32 s11, 0xf000 @@ -903,7 +903,7 @@ define amdgpu_kernel void @shl_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -929,7 +929,7 @@ define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: shl_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; VI-NEXT: s_mov_b32 s19, 0xf000 @@ -1029,8 +1029,8 @@ define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @s_shl_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: s_shl_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1041,8 +1041,8 @@ define amdgpu_kernel void @s_shl_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a ; ; VI-LABEL: s_shl_32_i64: ; VI: ; %bb.0: -; VI-NEXT: 
s_load_dword s4, s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1070,34 +1070,34 @@ define amdgpu_kernel void @s_shl_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a define amdgpu_kernel void @v_shl_32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_shl_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_ashr_i32 s3, s2, 31 -; SI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_ashr_i32 s7, s6, 31 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] -; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] -; SI-NEXT: v_mov_b32_e32 v2, 0 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_shl_32_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_ashr_i32 s3, s2, 31 -; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_ashr_i32 s7, s6, 31 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s2, s6, s0 -; VI-NEXT: s_addc_u32 s3, s7, s1 +; VI-NEXT: s_add_u32 s2, s2, s4 +; VI-NEXT: s_addc_u32 s3, s3, s5 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 
s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1137,7 +1137,7 @@ define amdgpu_kernel void @v_shl_32_i64(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @s_shl_constant_i64(ptr addrspace(1) %out, i64 %a) { ; SI-LABEL: s_shl_constant_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s9, 0xffff ; SI-NEXT: s_mov_b32 s8, s6 @@ -1153,7 +1153,7 @@ define amdgpu_kernel void @s_shl_constant_i64(ptr addrspace(1) %out, i64 %a) { ; ; VI-LABEL: s_shl_constant_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s9, 0xffff ; VI-NEXT: s_mov_b32 s8, s6 @@ -1195,7 +1195,7 @@ define amdgpu_kernel void @s_shl_constant_i64(ptr addrspace(1) %out, i64 %a) { define amdgpu_kernel void @v_shl_constant_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-LABEL: v_shl_constant_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1215,7 +1215,7 @@ define amdgpu_kernel void @v_shl_constant_i64(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: v_shl_constant_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1266,7 +1266,7 @@ define amdgpu_kernel void @v_shl_constant_i64(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @v_shl_i64_32_bit_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-LABEL: v_shl_i64_32_bit_constant: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; 
SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1285,7 +1285,7 @@ define amdgpu_kernel void @v_shl_i64_32_bit_constant(ptr addrspace(1) %out, ptr ; ; VI-LABEL: v_shl_i64_32_bit_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -1331,7 +1331,7 @@ define amdgpu_kernel void @v_shl_i64_32_bit_constant(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-LABEL: v_shl_inline_imm_64_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1349,7 +1349,7 @@ define amdgpu_kernel void @v_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_shl_inline_imm_64_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -1394,8 +1394,8 @@ define amdgpu_kernel void @v_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_64_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1407,8 +1407,8 @@ define amdgpu_kernel void @s_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: s_shl_inline_imm_64_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword 
s4, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1444,8 +1444,8 @@ define amdgpu_kernel void @s_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_shl_inline_imm_1_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_1_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1457,8 +1457,8 @@ define amdgpu_kernel void @s_shl_inline_imm_1_i64(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: s_shl_inline_imm_1_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1495,8 +1495,8 @@ define amdgpu_kernel void @s_shl_inline_imm_1_i64(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_1_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1508,8 +1508,8 @@ define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: s_shl_inline_imm_1_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: 
s_load_dword s4, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1542,8 +1542,8 @@ define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_neg_1_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1555,8 +1555,8 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(ptr addrspace(1) %out, p ; ; VI-LABEL: s_shl_inline_imm_neg_1_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1589,8 +1589,8 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_0_5_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1602,8 +1602,8 @@ define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: s_shl_inline_imm_0_5_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1636,8 +1636,8 @@ define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_neg_0_5_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1649,8 +1649,8 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(ptr addrspace(1) %out, p ; ; VI-LABEL: s_shl_inline_imm_neg_0_5_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1683,8 +1683,8 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_2_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1696,8 +1696,8 @@ define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: s_shl_inline_imm_2_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; 
VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1730,8 +1730,8 @@ define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_neg_2_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1743,8 +1743,8 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(ptr addrspace(1) %out, p ; ; VI-LABEL: s_shl_inline_imm_neg_2_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1777,8 +1777,8 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_4_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1790,8 +1790,8 @@ define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: s_shl_inline_imm_4_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ 
-1824,8 +1824,8 @@ define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_neg_4_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1837,8 +1837,8 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(ptr addrspace(1) %out, p ; ; VI-LABEL: s_shl_inline_imm_neg_4_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1874,8 +1874,8 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_f32_4_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1887,8 +1887,8 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(ptr addrspace(1) %out, p ; ; VI-LABEL: s_shl_inline_imm_f32_4_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1926,32 +1926,32 @@ define amdgpu_kernel void 
@s_shl_inline_imm_f32_4_0_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_f32_neg_4_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s2, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s0, -4.0 -; SI-NEXT: s_mov_b32 s1, -1 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[2:3], 0xd +; SI-NEXT: s_mov_b32 s4, -4.0 +; SI-NEXT: s_mov_b32 s5, -1 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_shl_inline_imm_f32_neg_4_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s2, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s0, -4.0 -; VI-NEXT: s_mov_b32 s1, -1 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s4, -4.0 +; VI-NEXT: s_mov_b32 s5, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_shl_inline_imm_f32_neg_4_0_i64: @@ -1982,32 +1982,32 @@ define amdgpu_kernel void 
@s_shl_inline_imm_f32_neg_4_0_i64(ptr addrspace(1) %ou define amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_high_imm_f32_4_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s2, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: s_mov_b32 s1, 4.0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[2:3], 0xd +; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_mov_b32 s5, 4.0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_shl_inline_high_imm_f32_4_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s2, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: s_mov_b32 s1, 4.0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s5, 4.0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_shl_inline_high_imm_f32_4_0_i64: @@ -2033,32 +2033,32 @@ define amdgpu_kernel void 
@s_shl_inline_high_imm_f32_4_0_i64(ptr addrspace(1) %o define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s2, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: s_mov_b32 s1, -4.0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[2:3], 0xd +; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_mov_b32 s5, -4.0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dword s2, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: s_mov_b32 s1, -4.0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s5, -4.0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64: @@ -2084,7 +2084,7 @@ define 
amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(ptr addrspace(1 define amdgpu_kernel void @test_mul2(i32 %p) { ; SI-LABEL: test_mul2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2096,7 +2096,7 @@ define amdgpu_kernel void @test_mul2(i32 %p) { ; ; VI-LABEL: test_mul2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll index b81af3eb838f1..8c663d963b73e 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { ; GFX9-LABEL: s_shl_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -21,7 +21,7 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; ; VI-LABEL: s_shl_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -40,7 +40,7 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; ; CI-LABEL: s_shl_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -59,7 +59,7 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; ; GFX10-LABEL: s_shl_v2i16: ; GFX10: ; %bb.0: 
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -71,7 +71,7 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; ; GFX11-LABEL: s_shl_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -90,7 +90,7 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_shl_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -101,7 +101,7 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_shl_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -120,7 +120,7 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; CI-LABEL: v_shl_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -142,7 +142,7 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: v_shl_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -153,7 +153,9 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_shl_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -178,20 +180,20 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 { ; GFX9-LABEL: shl_v_s_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v1, s2, v1 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, s0, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_v_s_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -212,21 +214,21 @@ define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: shl_v_s_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dword s8, s[0:1], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dword s0, s[2:3], 0xd +; 
CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: s_mov_b64 s[8:9], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_lshr_b32 s0, s8, 16 -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; CI-NEXT: s_lshr_b32 s1, s0, 16 +; CI-NEXT: s_mov_b64 s[6:7], s[10:11] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_lshlrev_b32_e32 v2, s8, v2 -; CI-NEXT: v_lshlrev_b32_e32 v3, s0, v3 +; CI-NEXT: v_lshlrev_b32_e32 v2, s0, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, s1, v3 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -235,9 +237,10 @@ define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: shl_v_s_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -247,9 +250,12 @@ define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: shl_v_s_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -271,20 
+277,20 @@ define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 { ; GFX9-LABEL: shl_s_v_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, s2 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_s_v_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -305,21 +311,21 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: shl_s_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dword s8, s[0:1], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dword s0, s[2:3], 0xd +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: s_mov_b64 s[8:9], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_lshr_b32 s0, s8, 16 -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; CI-NEXT: s_lshr_b32 s1, s0, 16 +; CI-NEXT: s_mov_b64 s[6:7], s[10:11] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: 
v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_lshl_b32_e32 v2, s8, v2 -; CI-NEXT: v_lshl_b32_e32 v3, s0, v3 +; CI-NEXT: v_lshl_b32_e32 v2, s0, v2 +; CI-NEXT: v_lshl_b32_e32 v3, s1, v3 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -328,9 +334,10 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: shl_s_v_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -340,9 +347,12 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: shl_s_v_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -364,7 +374,7 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: shl_imm_v_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -375,7 +385,7 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; 
VI-LABEL: shl_imm_v_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v4, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -395,7 +405,7 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: shl_imm_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -416,7 +426,7 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: shl_imm_v_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -427,7 +437,9 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: shl_imm_v_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -450,7 +462,7 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: shl_v_imm_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -461,7 +473,7 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: 
shl_v_imm_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -481,7 +493,7 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: shl_v_imm_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -498,7 +510,7 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: shl_v_imm_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -509,7 +521,9 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: shl_v_imm_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -532,7 +546,7 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_shl_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -544,7 +558,7 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_shl_v4i16: 
; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -566,7 +580,7 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; CI-LABEL: v_shl_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 @@ -595,7 +609,7 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: v_shl_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -607,7 +621,9 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_shl_v4i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -633,7 +649,7 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: shl_v_imm_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -645,7 +661,7 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: shl_v_imm_v4i16: ; VI: 
; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -669,7 +685,7 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: shl_v_imm_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -692,7 +708,7 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: shl_v_imm_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -704,7 +720,9 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: shl_v_imm_v4i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index c5fc51091704b..ddf331816694a 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -16,7 +16,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_x_sub_64: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; 
SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -32,7 +32,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; SI-GISEL-LABEL: v_test_i32_x_sub_64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -48,7 +48,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; VI-SDAG-LABEL: v_test_i32_x_sub_64: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -65,7 +65,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; VI-GISEL-LABEL: v_test_i32_x_sub_64: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -84,7 +84,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_i32_x_sub_64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -95,7 +95,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-LABEL: v_test_i32_x_sub_64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -106,7 
+106,9 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: v_test_i32_x_sub_64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -129,7 +131,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_x_sub_64_multi_use: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -151,7 +153,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_i32_x_sub_64_multi_use: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -173,7 +175,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_i32_x_sub_64_multi_use: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -196,7 +198,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_i32_x_sub_64_multi_use: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -221,7 +223,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_i32_x_sub_64_multi_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -238,7 +240,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_test_i32_x_sub_64_multi_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -255,7 +257,9 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_i32_x_sub_64_multi_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -287,7 +291,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_64_sub_x: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -303,7 +307,7 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; 
SI-GISEL-LABEL: v_test_i32_64_sub_x: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -319,7 +323,7 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; VI-SDAG-LABEL: v_test_i32_64_sub_x: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -336,7 +340,7 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; VI-GISEL-LABEL: v_test_i32_64_sub_x: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -355,7 +359,7 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_i32_64_sub_x: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -366,7 +370,7 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-LABEL: v_test_i32_64_sub_x: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -377,7 +381,9 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: v_test_i32_64_sub_x: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 
s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -400,7 +406,7 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_x_sub_65: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -416,7 +422,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; SI-GISEL-LABEL: v_test_i32_x_sub_65: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -432,7 +438,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; VI-SDAG-LABEL: v_test_i32_x_sub_65: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -449,7 +455,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; VI-GISEL-LABEL: v_test_i32_x_sub_65: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -468,7 +474,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) 
%out, ptr addrsp ; ; GFX9-SDAG-LABEL: v_test_i32_x_sub_65: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -479,7 +485,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-GISEL-LABEL: v_test_i32_x_sub_65: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -490,7 +496,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-SDAG-LABEL: v_test_i32_x_sub_65: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -501,7 +507,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-GISEL-LABEL: v_test_i32_x_sub_65: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -512,7 +518,9 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-SDAG-LABEL: v_test_i32_x_sub_65: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 
v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] @@ -525,7 +533,9 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-GISEL-LABEL: v_test_i32_x_sub_65: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] @@ -548,7 +558,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_65_sub_x: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -564,7 +574,7 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; SI-GISEL-LABEL: v_test_i32_65_sub_x: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -580,7 +590,7 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; VI-SDAG-LABEL: v_test_i32_65_sub_x: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -597,7 +607,7 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; VI-GISEL-LABEL: 
v_test_i32_65_sub_x: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -616,7 +626,7 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_i32_65_sub_x: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -627,7 +637,7 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-LABEL: v_test_i32_65_sub_x: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -638,7 +648,9 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: v_test_i32_65_sub_x: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -661,7 +673,7 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_x_sub_neg16: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -677,7 +689,7 @@ 
define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; SI-GISEL-LABEL: v_test_i32_x_sub_neg16: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -693,7 +705,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; VI-SDAG-LABEL: v_test_i32_x_sub_neg16: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -710,7 +722,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; VI-GISEL-LABEL: v_test_i32_x_sub_neg16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -729,7 +741,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg16: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -740,7 +752,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg16: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -751,7 
+763,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg16: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -762,7 +774,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg16: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -773,7 +785,9 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg16: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] @@ -786,7 +800,9 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; GFX11-GISEL-LABEL: v_test_i32_x_sub_neg16: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] @@ -809,7 +825,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add define amdgpu_kernel 
void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_neg16_sub_x: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -825,7 +841,7 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add ; ; SI-GISEL-LABEL: v_test_i32_neg16_sub_x: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -841,7 +857,7 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add ; ; VI-SDAG-LABEL: v_test_i32_neg16_sub_x: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -858,7 +874,7 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add ; ; VI-GISEL-LABEL: v_test_i32_neg16_sub_x: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -877,7 +893,7 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_test_i32_neg16_sub_x: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -888,7 +904,7 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, 
ptr add ; ; GFX10-LABEL: v_test_i32_neg16_sub_x: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -899,7 +915,9 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: v_test_i32_neg16_sub_x: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -922,7 +940,7 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_x_sub_neg17: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -938,7 +956,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; SI-GISEL-LABEL: v_test_i32_x_sub_neg17: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -954,7 +972,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; VI-SDAG-LABEL: v_test_i32_x_sub_neg17: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: 
v_mov_b32_e32 v1, s3 @@ -971,7 +989,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; VI-GISEL-LABEL: v_test_i32_x_sub_neg17: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -990,7 +1008,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg17: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -1001,7 +1019,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg17: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -1012,7 +1030,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg17: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -1023,7 +1041,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg17: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -1034,7 +1052,9 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg17: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1047,7 +1067,9 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; GFX11-GISEL-LABEL: v_test_i32_x_sub_neg17: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1070,7 +1092,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_neg17_sub_x: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1086,7 +1108,7 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add ; ; SI-GISEL-LABEL: v_test_i32_neg17_sub_x: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; 
SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1102,7 +1124,7 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add ; ; VI-SDAG-LABEL: v_test_i32_neg17_sub_x: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1119,7 +1141,7 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add ; ; VI-GISEL-LABEL: v_test_i32_neg17_sub_x: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1138,7 +1160,7 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_test_i32_neg17_sub_x: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1149,7 +1171,7 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: v_test_i32_neg17_sub_x: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -1160,7 +1182,9 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: v_test_i32_neg17_sub_x: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1183,7 +1207,7 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 { ; SI-LABEL: s_test_i32_x_sub_64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_sub_i32 s0, s0, 64 ; SI-NEXT: ;;#ASMSTART @@ -1193,7 +1217,7 @@ define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 { ; ; VI-LABEL: s_test_i32_x_sub_64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sub_i32 s0, s0, 64 ; VI-NEXT: ;;#ASMSTART @@ -1203,7 +1227,7 @@ define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 { ; ; GFX9-LABEL: s_test_i32_x_sub_64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sub_i32 s0, s0, 64 ; GFX9-NEXT: ;;#ASMSTART @@ -1213,7 +1237,7 @@ define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 { ; ; GFX10-LABEL: s_test_i32_x_sub_64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sub_i32 s0, s0, 64 ; GFX10-NEXT: ;;#ASMSTART @@ -1223,7 +1247,7 @@ define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 { ; ; GFX11-LABEL: s_test_i32_x_sub_64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s0, s0, 64 ; GFX11-NEXT: ;;#ASMSTART @@ -1238,7 +1262,7 @@ define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 { define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i16_x_sub_64: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 
+; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1254,7 +1278,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; SI-GISEL-LABEL: v_test_i16_x_sub_64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1270,7 +1294,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; VI-SDAG-LABEL: v_test_i16_x_sub_64: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1287,7 +1311,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; VI-GISEL-LABEL: v_test_i16_x_sub_64: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1306,7 +1330,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_i16_x_sub_64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -1317,7 +1341,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-LABEL: v_test_i16_x_sub_64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: 
v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] @@ -1328,7 +1352,9 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: v_test_i16_x_sub_64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -1351,7 +1377,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 1, v0 @@ -1369,7 +1395,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; ; SI-GISEL-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1387,7 +1413,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; ; VI-SDAG-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1405,7 +1431,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; ; 
VI-GISEL-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 1, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s2 @@ -1425,7 +1451,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; ; GFX9-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1437,7 +1463,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; ; GFX10-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1450,7 +1476,9 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; ; GFX11-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1477,7 +1505,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i16_x_sub_64_multi_use: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 
s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1499,7 +1527,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_i16_x_sub_64_multi_use: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1521,7 +1549,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_i16_x_sub_64_multi_use: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1544,7 +1572,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_i16_x_sub_64_multi_use: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1569,7 +1597,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_i16_x_sub_64_multi_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -1586,7 +1614,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_test_i16_x_sub_64_multi_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: 
global_load_ushort v1, v0, s[2:3] glc dlc @@ -1603,7 +1631,9 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_i16_x_sub_64_multi_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1635,7 +1665,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_64_64: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1654,7 +1684,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; ; SI-GISEL-LABEL: v_test_v2i16_x_sub_64_64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1676,7 +1706,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_64_64: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 64 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1696,7 +1726,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; ; VI-GISEL-LABEL: v_test_v2i16_x_sub_64_64: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 64 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1718,7 +1748,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; ; GFX9-LABEL: v_test_v2i16_x_sub_64_64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1729,7 +1759,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; ; GFX10-LABEL: v_test_v2i16_x_sub_64_64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -1740,7 +1770,9 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: v_test_v2i16_x_sub_64_64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1763,7 +1795,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_7_64: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1782,7 +1814,7 @@ define amdgpu_kernel void 
@v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; ; SI-GISEL-LABEL: v_test_v2i16_x_sub_7_64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1804,7 +1836,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_7_64: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 64 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1824,7 +1856,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; ; VI-GISEL-LABEL: v_test_v2i16_x_sub_7_64: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 64 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1846,7 +1878,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_7_64: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -1858,7 +1890,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_7_64: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x400007 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1870,7 +1902,7 @@ define 
amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; ; GFX10-LABEL: v_test_v2i16_x_sub_7_64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -1881,7 +1913,9 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_test_v2i16_x_sub_7_64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1904,7 +1938,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_64_123: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1923,7 +1957,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_test_v2i16_x_sub_64_123: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1945,7 +1979,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_64_123: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: 
v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff85 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1965,7 +1999,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_test_v2i16_x_sub_64_123: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7b ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1987,7 +2021,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_64_123: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -1999,7 +2033,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_64_123: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x7b0040 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -2011,7 +2045,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: v_test_v2i16_x_sub_64_123: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -2022,7 +2056,9 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_test_v2i16_x_sub_64_123: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2046,7 +2082,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_7_0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2064,7 +2100,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add ; ; SI-GISEL-LABEL: v_test_v2i16_x_sub_7_0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2084,7 +2120,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_7_0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2103,7 +2139,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add ; ; VI-GISEL-LABEL: v_test_v2i16_x_sub_7_0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2125,7 +2161,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: 
v_test_v2i16_x_sub_7_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2136,7 +2172,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: v_test_v2i16_x_sub_7_0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -2147,7 +2183,9 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: v_test_v2i16_x_sub_7_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2171,7 +2209,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_0_16: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2187,7 +2225,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; ; SI-GISEL-LABEL: v_test_v2i16_x_sub_0_16: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ 
-2208,7 +2246,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_0_16: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2227,7 +2265,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; ; VI-GISEL-LABEL: v_test_v2i16_x_sub_0_16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2248,7 +2286,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_test_v2i16_x_sub_0_16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2259,7 +2297,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; ; GFX10-LABEL: v_test_v2i16_x_sub_0_16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -2270,7 +2308,9 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_test_v2i16_x_sub_0_16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: 
global_load_b32 v1, v0, s[2:3] @@ -2293,7 +2333,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_0_1_0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2309,7 +2349,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; ; SI-GISEL-LABEL: v_test_v2i16_x_sub_0_1_0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2330,7 +2370,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_0_1_0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2349,7 +2389,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; ; VI-GISEL-LABEL: v_test_v2i16_x_sub_0_1_0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2370,7 +2410,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_0_1_0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-SDAG-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -2382,7 +2422,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_0_1_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 35 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -2394,7 +2434,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; ; GFX10-LABEL: v_test_v2i16_x_sub_0_1_0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -2405,7 +2445,9 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: v_test_v2i16_x_sub_0_1_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2428,7 +2470,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2444,7 +2486,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; ; 
SI-GISEL-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2465,7 +2507,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2484,7 +2526,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; ; VI-GISEL-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2505,7 +2547,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -2517,7 +2559,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 34 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -2529,7 +2571,7 @@ define amdgpu_kernel void 
@v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; ; GFX10-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -2540,7 +2582,9 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2564,7 +2608,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg32_neg32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2583,7 +2627,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg32_neg32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2605,7 +2649,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg32_neg32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; 
VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 32 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -2625,7 +2669,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg32_neg32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_not_b32_e32 v4, 31 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -2647,7 +2691,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2658,7 +2702,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_test_v2i16_x_add_neg32_neg32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -2669,7 +2713,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_v2i16_x_add_neg32_neg32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2692,7 +2738,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; 
SI-SDAG-LABEL: v_test_v2i16_x_add_0_neg32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2708,7 +2754,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_0_neg32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2729,7 +2775,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_0_neg32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2748,7 +2794,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_0_neg32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2769,7 +2815,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_v2i16_x_add_0_neg32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2780,7 +2826,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: 
v_test_v2i16_x_add_0_neg32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -2791,7 +2837,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_test_v2i16_x_add_0_neg32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2814,7 +2862,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg32_0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2832,7 +2880,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg32_0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2852,7 +2900,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg32_0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; 
VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2871,7 +2919,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg32_0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2893,7 +2941,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_v2i16_x_add_neg32_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2904,7 +2952,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: v_test_v2i16_x_add_neg32_0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -2915,7 +2963,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_test_v2i16_x_add_neg32_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2939,7 +2989,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg16_neg16: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2958,7 +3008,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg16_neg16: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2980,7 +3030,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg16_neg16: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, -16 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -3000,7 +3050,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg16_neg16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, -16 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3022,7 +3072,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3033,7 +3083,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_test_v2i16_x_add_neg16_neg16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; 
GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -3044,7 +3094,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_v2i16_x_add_neg16_neg16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3067,7 +3119,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_0_neg16: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3083,7 +3135,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_0_neg16: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3104,7 +3156,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_0_neg16: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3123,7 +3175,7 @@ define amdgpu_kernel void 
@v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_0_neg16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3144,7 +3196,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_v2i16_x_add_0_neg16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3155,7 +3207,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: v_test_v2i16_x_add_0_neg16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -3166,7 +3218,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_test_v2i16_x_add_0_neg16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3189,7 +3243,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg16_0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: 
s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3207,7 +3261,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg16_0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3227,7 +3281,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg16_0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3246,7 +3300,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg16_0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3268,7 +3322,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_v2i16_x_add_neg16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3279,7 +3333,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: v_test_v2i16_x_add_neg16_0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -3290,7 +3344,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_test_v2i16_x_add_neg16_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3313,7 +3369,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3332,7 +3388,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3354,7 +3410,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffc400 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -3374,7 +3430,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: ; VI-GISEL: 
; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffc400 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3396,7 +3452,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -3408,7 +3464,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xc400c400 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3420,7 +3476,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -3431,7 +3487,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -3442,7 +3498,9 @@ define 
amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3455,7 +3513,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3478,7 +3538,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3497,7 +3557,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3519,7 +3579,7 @@ define amdgpu_kernel void 
@v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x4400 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -3539,7 +3599,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x4400 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3561,7 +3621,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -3573,7 +3633,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x44004400 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3585,7 +3645,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -3596,7 +3656,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -3607,7 +3667,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3620,7 +3682,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3643,7 +3707,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg_fptwo: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; 
SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3662,7 +3726,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg_fptwo: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3684,7 +3748,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_fptwo: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x4000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -3704,7 +3768,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_fptwo: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x4000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3726,7 +3790,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p ; ; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3737,7 +3801,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p ; ; GFX10-LABEL: v_test_v2i16_x_add_neg_fptwo: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: 
global_load_dword v1, v0, s[2:3] @@ -3748,7 +3812,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p ; ; GFX11-LABEL: v_test_v2i16_x_add_neg_fptwo: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3771,7 +3837,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3790,7 +3856,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3812,7 +3878,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffc000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -3832,7 +3898,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; VI-GISEL: ; %bb.0: -; 
VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffc000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3854,7 +3920,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out ; ; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3865,7 +3931,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out ; ; GFX10-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -3876,7 +3942,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out ; ; GFX11-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3899,7 +3967,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_undef_neg32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ 
-3916,7 +3984,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_undef_neg32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3935,7 +4003,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_undef_neg32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3953,7 +4021,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_undef_neg32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3975,7 +4043,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_v2i16_x_add_undef_neg32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3986,7 +4054,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_test_v2i16_x_add_undef_neg32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -3997,7 
+4065,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_v2i16_x_add_undef_neg32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -4020,7 +4090,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4037,7 +4107,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -4054,7 +4124,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4071,7 +4141,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; 
VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4093,7 +4163,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -4104,7 +4174,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_not_b32_e32 v2, 31 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -4116,7 +4186,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -4127,7 +4197,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -4138,7 +4208,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; 
GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] @@ -4151,7 +4223,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll index 1ab63762ecbd7..9f3596359a662 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -5,22 +5,22 @@ define amdgpu_kernel void @break_inserted_outside_of_loop(ptr addrspace(1) %out, i32 %a) { ; SI-LABEL: break_inserted_outside_of_loop: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, s2, v0 +; SI-NEXT: v_and_b32_e32 v0, s0, v0 ; SI-NEXT: v_and_b32_e32 v0, 1, v0 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; SI-NEXT: s_mov_b64 s[2:3], 0 +; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: .LBB0_1: ; %ENDIF ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_and_b64 s[4:5], exec, vcc -; SI-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] -; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; SI-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] +; SI-NEXT: 
s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB0_1 ; SI-NEXT: ; %bb.2: ; %ENDLOOP -; SI-NEXT: s_or_b64 exec, exec, s[2:3] -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_or_b64 exec, exec, s[0:1] +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -30,22 +30,22 @@ define amdgpu_kernel void @break_inserted_outside_of_loop(ptr addrspace(1) %out, ; ; FLAT-LABEL: break_inserted_outside_of_loop: ; FLAT: ; %bb.0: ; %main_body -; FLAT-NEXT: s_load_dword s2, s[0:1], 0x2c +; FLAT-NEXT: s_load_dword s0, s[2:3], 0x2c ; FLAT-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_and_b32_e32 v0, s2, v0 +; FLAT-NEXT: v_and_b32_e32 v0, s0, v0 ; FLAT-NEXT: v_and_b32_e32 v0, 1, v0 ; FLAT-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; FLAT-NEXT: s_mov_b64 s[2:3], 0 +; FLAT-NEXT: s_mov_b64 s[0:1], 0 ; FLAT-NEXT: .LBB0_1: ; %ENDIF ; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 ; FLAT-NEXT: s_and_b64 s[4:5], exec, vcc -; FLAT-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] -; FLAT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; FLAT-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] +; FLAT-NEXT: s_andn2_b64 exec, exec, s[0:1] ; FLAT-NEXT: s_cbranch_execnz .LBB0_1 ; FLAT-NEXT: ; %bb.2: ; %ENDLOOP -; FLAT-NEXT: s_or_b64 exec, exec, s[2:3] -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; FLAT-NEXT: s_or_b64 exec, exec, s[0:1] +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; FLAT-NEXT: s_mov_b32 s3, 0xf000 ; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: v_mov_b32_e32 v0, 0 @@ -71,23 +71,23 @@ define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) { ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_mov_b64 s[2:3], 0 +; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_and_saveexec_b64 s[6:7], vcc ; SI-NEXT: s_cbranch_execz .LBB1_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_load_dword s0, 
s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s0, 0 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: s_and_b64 s[4:5], s[0:1], exec +; SI-NEXT: s_cmp_eq_u32 s2, 0 +; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-NEXT: s_and_b64 s[4:5], s[2:3], exec ; SI-NEXT: .LBB1_2: ; %endif ; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: .LBB1_3: ; %loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_and_b64 s[0:1], exec, s[4:5] -; SI-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] -; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; SI-NEXT: s_and_b64 s[2:3], exec, s[4:5] +; SI-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB1_3 ; SI-NEXT: ; %bb.4: ; %exit ; SI-NEXT: s_endpgm @@ -96,23 +96,23 @@ define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) { ; FLAT: ; %bb.0: ; %entry ; FLAT-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; FLAT-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; FLAT-NEXT: s_mov_b64 s[2:3], 0 +; FLAT-NEXT: s_mov_b64 s[0:1], 0 ; FLAT-NEXT: s_mov_b64 s[4:5], 0 ; FLAT-NEXT: s_and_saveexec_b64 s[6:7], vcc ; FLAT-NEXT: s_cbranch_execz .LBB1_2 ; FLAT-NEXT: ; %bb.1: ; %else -; FLAT-NEXT: s_load_dword s0, s[0:1], 0x24 +; FLAT-NEXT: s_load_dword s2, s[2:3], 0x24 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_cmp_eq_u32 s0, 0 -; FLAT-NEXT: s_cselect_b64 s[0:1], -1, 0 -; FLAT-NEXT: s_and_b64 s[4:5], s[0:1], exec +; FLAT-NEXT: s_cmp_eq_u32 s2, 0 +; FLAT-NEXT: s_cselect_b64 s[2:3], -1, 0 +; FLAT-NEXT: s_and_b64 s[4:5], s[2:3], exec ; FLAT-NEXT: .LBB1_2: ; %endif ; FLAT-NEXT: s_or_b64 exec, exec, s[6:7] ; FLAT-NEXT: .LBB1_3: ; %loop ; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 -; FLAT-NEXT: s_and_b64 s[0:1], exec, s[4:5] -; FLAT-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] -; FLAT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; FLAT-NEXT: s_and_b64 s[2:3], exec, s[4:5] +; FLAT-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; FLAT-NEXT: s_andn2_b64 exec, exec, s[0:1] ; FLAT-NEXT: 
s_cbranch_execnz .LBB1_3 ; FLAT-NEXT: ; %bb.4: ; %exit ; FLAT-NEXT: s_endpgm @@ -166,12 +166,12 @@ declare float @llvm.fabs.f32(float) nounwind readnone define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind { ; SI-LABEL: loop_land_info_assert: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s2, s[0:1], 0xa +; SI-NEXT: s_load_dword s0, s[2:3], 0xa ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lt_i32 s2, 4 +; SI-NEXT: s_cmp_lt_i32 s0, 4 ; SI-NEXT: s_cbranch_scc1 .LBB3_4 ; SI-NEXT: ; %bb.1: ; %for.cond.preheader -; SI-NEXT: s_load_dword s0, s[0:1], 0xc +; SI-NEXT: s_load_dword s0, s[2:3], 0xc ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmpk_lt_i32 s0, 0x3e8 ; SI-NEXT: s_cbranch_scc0 .LBB3_4 @@ -186,12 +186,12 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 ; ; FLAT-LABEL: loop_land_info_assert: ; FLAT: ; %bb.0: ; %entry -; FLAT-NEXT: s_load_dword s2, s[0:1], 0x28 +; FLAT-NEXT: s_load_dword s0, s[2:3], 0x28 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_cmp_lt_i32 s2, 4 +; FLAT-NEXT: s_cmp_lt_i32 s0, 4 ; FLAT-NEXT: s_cbranch_scc1 .LBB3_4 ; FLAT-NEXT: ; %bb.1: ; %for.cond.preheader -; FLAT-NEXT: s_load_dword s0, s[0:1], 0x30 +; FLAT-NEXT: s_load_dword s0, s[2:3], 0x30 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: s_cmpk_lt_i32 s0, 0x3e8 ; FLAT-NEXT: s_cbranch_scc0 .LBB3_4 diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll index 7c5537747dd7b..e64dcb74267dd 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @test(i32 %arg, i32 %arg1) { ; CHECK-LABEL: test: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_eq_u32 s0, 0 ; CHECK-NEXT: 
s_cselect_b64 s[2:3], -1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll index f9a17783f0d35..1d183210f9538 100644 --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll +++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll @@ -43,14 +43,14 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-LABEL: kernel: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[4:5], 0x10 -; CHECK-NEXT: s_load_dword s10, s[4:5], 0x0 +; CHECK-NEXT: s_load_dword s0, s[6:7], 0x10 +; CHECK-NEXT: s_load_dword s10, s[6:7], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmpk_lg_i32 s0, 0x100 ; CHECK-NEXT: s_cbranch_scc0 .LBB0_6 ; CHECK-NEXT: ; %bb.1: ; %if.else ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 10, v0 -; CHECK-NEXT: s_mov_b64 s[6:7], 0 +; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-NEXT: s_mov_b64 s[0:1], 0 ; CHECK-NEXT: s_and_saveexec_b64 s[8:9], vcc @@ -65,7 +65,7 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: s_and_b64 s[2:3], s[2:3], exec ; CHECK-NEXT: .LBB0_5: ; %Flow2 ; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] -; CHECK-NEXT: s_and_b64 vcc, exec, s[6:7] +; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5] ; CHECK-NEXT: s_cbranch_vccz .LBB0_8 ; CHECK-NEXT: s_branch .LBB0_7 ; CHECK-NEXT: .LBB0_6: @@ -77,15 +77,15 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: s_mov_b64 s[0:1], -1 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_13 ; CHECK-NEXT: .LBB0_8: ; %Flow4 -; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] ; CHECK-NEXT: .LBB0_9: ; %UnifiedUnreachableBlock ; CHECK-NEXT: ; divergent unreachable ; CHECK-NEXT: .LBB0_10: ; %Flow6 -; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_and_saveexec_b64 
s[2:3], s[0:1] ; CHECK-NEXT: s_cbranch_execz .LBB0_12 ; CHECK-NEXT: ; %bb.11: ; %if.end6.sink.split -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: v_mov_b32_e32 v1, s10 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -96,13 +96,14 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: s_mov_b64 s[0:1], 0 ; CHECK-NEXT: s_or_b64 s[2:3], s[2:3], exec ; CHECK-NEXT: s_trap 2 -; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] ; CHECK-NEXT: s_cbranch_execnz .LBB0_9 ; CHECK-NEXT: s_branch .LBB0_10 ; CHECK-NEXT: .LBB0_14: ; %cond.false.i8 ; CHECK-NEXT: s_mov_b64 s[2:3], -1 ; CHECK-NEXT: s_trap 2 ; CHECK-NEXT: s_branch .LBB0_4 + entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %cmp = icmp eq i32 %n, 256 diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll index 2c0f64f85d823..5536a09538e6e 100644 --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -469,4 +469,4 @@ entry: } attributes #0 = { nounwind } -attributes #1 = { nounwind noinline } +attributes #1 = { nounwind noinline "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll index 9a03d216c7a99..b54df3b4d0c6c 100644 --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_sext_i1_to_i32: ; SI: 
; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -19,7 +19,7 @@ define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 % ; ; VI-LABEL: s_sext_i1_to_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -39,15 +39,13 @@ define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 % define amdgpu_kernel void @test_s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) nounwind { ; SI-LABEL: test_s_sext_i32_to_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mul_i32 s4, s6, s7 -; SI-NEXT: s_add_i32 s4, s4, s8 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mul_i32 s4, s4, s5 +; SI-NEXT: s_add_i32 s4, s4, s6 ; SI-NEXT: s_ashr_i32 s5, s4, 31 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -56,15 +54,13 @@ define amdgpu_kernel void @test_s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a, ; ; VI-LABEL: test_s_sext_i32_to_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mul_i32 s4, s6, s7 -; VI-NEXT: s_add_i32 s4, s4, s8 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mul_i32 s4, s4, s5 +; VI-NEXT: s_add_i32 s4, s4, s6 ; 
VI-NEXT: s_ashr_i32 s5, s4, 31 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -81,7 +77,7 @@ entry: define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_sext_i1_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -96,7 +92,7 @@ define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 % ; ; VI-LABEL: s_sext_i1_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -117,8 +113,8 @@ define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 % define amdgpu_kernel void @s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a) nounwind { ; SI-LABEL: s_sext_i32_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -130,8 +126,8 @@ define amdgpu_kernel void @s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a) noun ; ; VI-LABEL: s_sext_i32_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -148,7 +144,7 @@ define amdgpu_kernel void @s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a) noun define amdgpu_kernel void @v_sext_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: v_sext_i32_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; 
SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -166,7 +162,7 @@ define amdgpu_kernel void @v_sext_i32_to_i64(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_sext_i32_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -190,8 +186,8 @@ define amdgpu_kernel void @v_sext_i32_to_i64(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @s_sext_i16_to_i64(ptr addrspace(1) %out, i16 %a) nounwind { ; SI-LABEL: s_sext_i16_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -203,8 +199,8 @@ define amdgpu_kernel void @s_sext_i16_to_i64(ptr addrspace(1) %out, i16 %a) noun ; ; VI-LABEL: s_sext_i16_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -221,7 +217,7 @@ define amdgpu_kernel void @s_sext_i16_to_i64(ptr addrspace(1) %out, i16 %a) noun define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_sext_i1_to_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -235,7 +231,7 @@ define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 % ; ; VI-LABEL: s_sext_i1_to_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -259,8 +255,8 @@ define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 % define amdgpu_kernel void @s_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind { ; SI-LABEL: s_sext_i1_to_i16_with_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -275,8 +271,8 @@ define amdgpu_kernel void @s_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 ; ; VI-LABEL: s_sext_i1_to_i16_with_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -299,15 +295,13 @@ define amdgpu_kernel void @s_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) nounwind { ; SI-LABEL: v_sext_i1_to_i16_with_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_cmp_eq_u32 s7, s8 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v0 +; SI-NEXT: s_cmp_eq_u32 s5, s6 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; SI-NEXT: 
v_cndmask_b32_e64 v0, 0, -1, s[4:5] @@ -316,15 +310,13 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 ; ; VI-LABEL: v_sext_i1_to_i16_with_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_cmp_eq_u32 s7, s8 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v0 +; VI-NEXT: s_cmp_eq_u32 s5, s6 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; VI-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] @@ -350,8 +342,8 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) nounwind { ; SI-LABEL: s_sext_v4i8_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -375,8 +367,8 @@ define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) n ; ; VI-LABEL: s_sext_v4i8_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -415,7 +407,7 @@ define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) n define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: v_sext_v4i8_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -443,7 +435,7 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: v_sext_v4i8_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -487,7 +479,7 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @s_sext_v4i16_to_v4i32(ptr addrspace(1) %out, i64 %a) nounwind { ; SI-LABEL: s_sext_v4i16_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -513,7 +505,7 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(ptr addrspace(1) %out, i64 %a) ; ; VI-LABEL: s_sext_v4i16_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -552,7 +544,7 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(ptr addrspace(1) %out, i64 %a) define amdgpu_kernel void @v_sext_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: v_sext_v4i16_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -580,7 +572,7 @@ define amdgpu_kernel void @v_sext_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_sext_v4i16_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; 
VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll index 539cfc71a80f9..e86ee1adef3d0 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -42,25 +42,30 @@ define amdgpu_kernel void @test_simple_indirect_call() { ; ; GFX9-LABEL: test_simple_indirect_call: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x4 ; GFX9-NEXT: s_add_u32 s0, s0, s15 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: v_mul_lo_u32 v0, s4, v0 -; GFX9-NEXT: s_getpc_b64 s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, indirect@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, indirect@rel32@hi+12 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mad_u32_u24 v0, v1, s5, v0 -; GFX9-NEXT: v_add_lshl_u32 v0, v0, v2, 3 ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: ds_write_b64 v0, v[3:4] -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s8, s8, 16 +; GFX9-NEXT: s_mul_i32 s8, s8, s9 +; GFX9-NEXT: v_mul_lo_u32 v3, s8, v0 +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, indirect@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, indirect@rel32@hi+12 +; GFX9-NEXT: v_mad_u32_u24 v3, v1, s9, v3 +; GFX9-NEXT: v_add_lshl_u32 v5, v3, v2, 3 +; GFX9-NEXT: v_mov_b32_e32 v3, s16 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-NEXT: ds_write_b64 v5, v[3:4] +; GFX9-NEXT: s_swappc_b64 s[30:31], 
s[16:17] ; GFX9-NEXT: s_endpgm %fptr = alloca ptr, addrspace(5) %fptr.cast = addrspacecast ptr addrspace(5) %fptr to ptr diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll index 5a241f85b2e2c..ba1caf376975c 100644 --- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll +++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll @@ -837,5 +837,5 @@ entry: ; GCN-PRELINK: declare float @_Z4cbrtf(float) local_unnamed_addr #[[$NOUNWIND_READONLY:[0-9]+]] ; GCN-PRELINK-DAG: attributes #[[$NOUNWIND]] = { nounwind } -; GCN-PRELINK-DAG: attributes #[[$NOUNWIND_READONLY]] = { nounwind memory(read) } +; GCN-PRELINK-DAG: attributes #[[$NOUNWIND_READONLY]] = { nounwind memory(read) "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll index d1f05358ff13a..b872112922204 100644 --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll @@ -7,8 +7,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) { ; CI-LABEL: sint_to_fp_i32_to_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -18,8 +18,8 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: sint_to_fp_i32_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -36,8 +36,8 @@ define 
amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) { ; CI-LABEL: sint_to_fp_i1_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 @@ -50,8 +50,8 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: sint_to_fp_i1_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 @@ -70,8 +70,8 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in) { ; CI-LABEL: sint_to_fp_i1_f64_load: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitcmp1_b32 s2, 0 ; CI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -84,8 +84,8 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in) ; ; VI-LABEL: sint_to_fp_i1_f64_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s2, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -103,7 +103,7 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in) define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 
%in) { ; CI-LABEL: s_sint_to_fp_i64_to_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s3 ; CI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -116,7 +116,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i ; ; VI-LABEL: s_sint_to_fp_i64_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], s3 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -134,7 +134,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; CI-LABEL: v_sint_to_fp_i64_to_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -153,7 +153,7 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_sint_to_fp_i64_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -181,8 +181,8 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) { ; CI-LABEL: s_sint_to_fp_i8_to_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i8 s2, s2 ; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 @@ -193,8 +193,8 @@ define amdgpu_kernel void 
@s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) ; ; VI-LABEL: s_sint_to_fp_i8_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bfe_i32 s2, s2, 0x80000 ; VI-NEXT: s_sext_i32_i16 s2, s2 @@ -230,8 +230,8 @@ define double @v_sint_to_fp_i8_to_f64(i8 %in) { define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { ; CI-LABEL: s_select_sint_to_fp_i1_vals_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 @@ -244,8 +244,8 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; ; VI-LABEL: s_select_sint_to_fp_i1_vals_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 @@ -281,8 +281,8 @@ define void @v_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) { ; CI-LABEL: s_select_sint_to_fp_i1_vals_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 @@ -295,8 +295,8 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; ; VI-LABEL: s_select_sint_to_fp_i1_vals_i64: ; VI: ; %bb.0: -; 
VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 @@ -351,8 +351,8 @@ define void @v_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { ; CI-LABEL: s_swap_select_sint_to_fp_i1_vals_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 @@ -365,8 +365,8 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) ; ; VI-LABEL: s_swap_select_sint_to_fp_i1_vals_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll index b03726817c1b4..3b35b2d3d9865 100644 --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %in) #0 { ; GFX6-LABEL: s_sint_to_fp_i64_to_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -32,7 +32,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i ; ; GFX8-LABEL: s_sint_to_fp_i64_to_f16: ; GFX8: ; %bb.0: -; 
GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s5, s2, s3 ; GFX8-NEXT: s_flbit_i32 s4, s3 @@ -54,7 +54,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i ; ; GFX11-LABEL: s_sint_to_fp_i64_to_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s4, s2, s3 @@ -87,7 +87,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_sint_to_fp_i64_to_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -116,7 +116,7 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: v_sint_to_fp_i64_to_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -146,14 +146,15 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_sint_to_fp_i64_to_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] +; GFX11-NEXT: global_load_b64 v[0:1], v0, 
s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v3, v1, v2 -; GFX11-NEXT: v_cls_i32_e32 v4, v2 +; GFX11-NEXT: v_xor_b32_e32 v3, v0, v1 +; GFX11-NEXT: v_cls_i32_e32 v4, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v3 ; GFX11-NEXT: v_add_nc_u32_e32 v4, -1, v4 @@ -161,16 +162,17 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: v_add_nc_u32_e32 v3, 32, v3 ; GFX11-NEXT: v_min_u32_e32 v3, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b64 v[1:2], v3, v[1:2] -; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] +; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v3 -; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_ldexp_f32 v1, v1, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v3 +; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -186,7 +188,7 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %in) #0 { ; GFX6-LABEL: s_sint_to_fp_i64_to_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; 
GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -209,7 +211,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i ; ; GFX8-LABEL: s_sint_to_fp_i64_to_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s5, s2, s3 ; GFX8-NEXT: s_flbit_i32 s4, s3 @@ -230,7 +232,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i ; ; GFX11-LABEL: s_sint_to_fp_i64_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s4, s2, s3 @@ -261,7 +263,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_sint_to_fp_i64_to_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -289,7 +291,7 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: v_sint_to_fp_i64_to_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -318,14 +320,16 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_sint_to_fp_i64_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; 
GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v3, v1, v2 -; GFX11-NEXT: v_cls_i32_e32 v4, v2 +; GFX11-NEXT: v_xor_b32_e32 v3, v0, v1 +; GFX11-NEXT: v_cls_i32_e32 v4, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v3 ; GFX11-NEXT: v_add_nc_u32_e32 v4, -1, v4 @@ -333,15 +337,15 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: v_add_nc_u32_e32 v3, 32, v3 ; GFX11-NEXT: v_min_u32_e32 v3, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b64 v[1:2], v3, v[1:2] -; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] +; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v3 -; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v3 +; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_ldexp_f32 v1, v1, v2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -357,8 +361,8 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 x i64> %in) #0{ ; 
GFX6-LABEL: s_sint_to_fp_v2i64_to_v2f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -391,8 +395,8 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 ; ; GFX8-LABEL: s_sint_to_fp_v2i64_to_v2f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s3, s6, s7 ; GFX8-NEXT: s_flbit_i32 s2, s7 @@ -426,8 +430,8 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 ; GFX11-LABEL: s_sint_to_fp_v2i64_to_v2f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s3, s6, s7 @@ -467,7 +471,7 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_sint_to_fp_v4i64_to_v4f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 5, v0 @@ -534,7 +538,7 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: v_sint_to_fp_v4i64_to_v4f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -603,22 +607,24 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_sint_to_fp_v4i64_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16 -; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3] +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] offset:16 +; GFX11-NEXT: global_load_b128 v[4:7], v4, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_xor_b32_e32 v9, v3, v4 -; GFX11-NEXT: v_xor_b32_e32 v11, v1, v2 +; GFX11-NEXT: v_xor_b32_e32 v9, v2, v3 +; GFX11-NEXT: v_xor_b32_e32 v11, v0, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v13, v7, v8 -; GFX11-NEXT: v_xor_b32_e32 v15, v5, v6 -; GFX11-NEXT: v_cls_i32_e32 v10, v4 -; GFX11-NEXT: v_cls_i32_e32 v12, v2 -; GFX11-NEXT: v_cls_i32_e32 v14, v8 -; GFX11-NEXT: v_cls_i32_e32 v16, v6 +; GFX11-NEXT: v_xor_b32_e32 v13, v6, v7 +; GFX11-NEXT: v_xor_b32_e32 v15, v4, v5 +; GFX11-NEXT: v_cls_i32_e32 v10, v3 +; GFX11-NEXT: v_cls_i32_e32 v12, v1 +; GFX11-NEXT: v_cls_i32_e32 v14, v7 +; GFX11-NEXT: v_cls_i32_e32 v16, v5 ; GFX11-NEXT: v_ashrrev_i32_e32 v9, 31, v9 ; GFX11-NEXT: v_ashrrev_i32_e32 v11, 31, v11 ; GFX11-NEXT: v_ashrrev_i32_e32 v13, 31, v13 @@ -638,33 +644,33 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt ; GFX11-NEXT: v_min_u32_e32 v11, v14, v13 ; GFX11-NEXT: v_min_u32_e32 v12, v16, v15 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 
v[3:4], v9, v[3:4] -; GFX11-NEXT: v_lshlrev_b64 v[1:2], v10, v[1:2] +; GFX11-NEXT: v_lshlrev_b64 v[2:3], v9, v[2:3] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v10, v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[7:8], v11, v[7:8] -; GFX11-NEXT: v_lshlrev_b64 v[5:6], v12, v[5:6] +; GFX11-NEXT: v_lshlrev_b64 v[6:7], v11, v[6:7] +; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, v[4:5] ; GFX11-NEXT: v_sub_nc_u32_e32 v9, 32, v9 ; GFX11-NEXT: v_sub_nc_u32_e32 v10, 32, v10 -; GFX11-NEXT: v_min_u32_e32 v3, 1, v3 -; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 -; GFX11-NEXT: v_min_u32_e32 v7, 1, v7 -; GFX11-NEXT: v_min_u32_e32 v5, 1, v5 +; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX11-NEXT: v_min_u32_e32 v6, 1, v6 +; GFX11-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX11-NEXT: v_sub_nc_u32_e32 v11, 32, v11 -; GFX11-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX11-NEXT: v_or_b32_e32 v2, v8, v7 -; GFX11-NEXT: v_or_b32_e32 v4, v6, v5 -; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v12 -; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v7, v6 +; GFX11-NEXT: v_or_b32_e32 v3, v5, v4 +; GFX11-NEXT: v_sub_nc_u32_e32 v4, 32, v12 +; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f32_i32_e32 v6, v2 -; GFX11-NEXT: v_cvt_f32_i32_e32 v4, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 4, v0 -; GFX11-NEXT: v_ldexp_f32 v3, v3, v9 -; GFX11-NEXT: v_ldexp_f32 v2, v1, v10 -; GFX11-NEXT: v_ldexp_f32 v1, v6, v11 -; GFX11-NEXT: v_ldexp_f32 v0, v4, v5 -; GFX11-NEXT: global_store_b128 v7, v[0:3], s[0:1] +; GFX11-NEXT: v_cvt_f32_i32_e32 v5, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 4, v8 +; GFX11-NEXT: v_ldexp_f32 v3, v2, v9 +; GFX11-NEXT: v_ldexp_f32 v2, v0, v10 +; GFX11-NEXT: v_ldexp_f32 v1, v1, v11 +; GFX11-NEXT: 
v_ldexp_f32 v0, v5, v4 +; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -680,8 +686,8 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 x i64> %in) #0{ ; GFX6-LABEL: s_sint_to_fp_v2i64_to_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -718,8 +724,8 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 ; ; GFX8-LABEL: s_sint_to_fp_v2i64_to_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s3, s6, s7 ; GFX8-NEXT: s_flbit_i32 s2, s7 @@ -756,8 +762,8 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 ; GFX11-LABEL: s_sint_to_fp_v2i64_to_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s3, s6, s7 @@ -802,7 +808,7 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_sint_to_fp_v4i64_to_v4f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 5, v0 @@ -877,7 +883,7 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: v_sint_to_fp_v4i64_to_v4f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -952,22 +958,24 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_sint_to_fp_v4i64_to_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16 -; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3] +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] offset:16 +; GFX11-NEXT: global_load_b128 v[4:7], v4, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_xor_b32_e32 v9, v3, v4 -; GFX11-NEXT: v_xor_b32_e32 v11, v1, v2 +; GFX11-NEXT: v_xor_b32_e32 v9, v2, v3 +; GFX11-NEXT: v_xor_b32_e32 v11, v0, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v13, v7, v8 -; GFX11-NEXT: v_xor_b32_e32 v15, v5, v6 -; GFX11-NEXT: v_cls_i32_e32 v10, v4 -; GFX11-NEXT: v_cls_i32_e32 v12, v2 -; GFX11-NEXT: v_cls_i32_e32 v14, v8 -; GFX11-NEXT: v_cls_i32_e32 v16, v6 +; GFX11-NEXT: v_xor_b32_e32 v13, v6, v7 +; GFX11-NEXT: v_xor_b32_e32 v15, v4, v5 +; GFX11-NEXT: v_cls_i32_e32 v10, v3 +; GFX11-NEXT: v_cls_i32_e32 v12, v1 +; GFX11-NEXT: v_cls_i32_e32 v14, v7 +; GFX11-NEXT: v_cls_i32_e32 v16, v5 ; GFX11-NEXT: v_ashrrev_i32_e32 v9, 31, v9 ; GFX11-NEXT: 
v_ashrrev_i32_e32 v11, 31, v11 ; GFX11-NEXT: v_ashrrev_i32_e32 v13, 31, v13 @@ -987,41 +995,41 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt ; GFX11-NEXT: v_min_u32_e32 v11, v14, v13 ; GFX11-NEXT: v_min_u32_e32 v12, v16, v15 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[3:4], v9, v[3:4] -; GFX11-NEXT: v_lshlrev_b64 v[1:2], v10, v[1:2] +; GFX11-NEXT: v_lshlrev_b64 v[2:3], v9, v[2:3] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v10, v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[7:8], v11, v[7:8] -; GFX11-NEXT: v_lshlrev_b64 v[5:6], v12, v[5:6] +; GFX11-NEXT: v_lshlrev_b64 v[6:7], v11, v[6:7] +; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, v[4:5] ; GFX11-NEXT: v_sub_nc_u32_e32 v9, 32, v9 ; GFX11-NEXT: v_sub_nc_u32_e32 v10, 32, v10 -; GFX11-NEXT: v_min_u32_e32 v3, 1, v3 -; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 -; GFX11-NEXT: v_min_u32_e32 v7, 1, v7 -; GFX11-NEXT: v_min_u32_e32 v5, 1, v5 +; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX11-NEXT: v_min_u32_e32 v6, 1, v6 +; GFX11-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX11-NEXT: v_sub_nc_u32_e32 v11, 32, v11 -; GFX11-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX11-NEXT: v_or_b32_e32 v2, v8, v7 -; GFX11-NEXT: v_or_b32_e32 v4, v6, v5 -; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v12 -; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v7, v6 +; GFX11-NEXT: v_or_b32_e32 v3, v5, v4 +; GFX11-NEXT: v_sub_nc_u32_e32 v4, 32, v12 ; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v8 +; 
GFX11-NEXT: v_ldexp_f32 v2, v2, v9 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v10 +; GFX11-NEXT: v_ldexp_f32 v1, v1, v11 +; GFX11-NEXT: v_ldexp_f32 v3, v3, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_ldexp_f32 v3, v3, v9 -; GFX11-NEXT: v_ldexp_f32 v1, v1, v10 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_ldexp_f32 v2, v2, v11 -; GFX11-NEXT: v_ldexp_f32 v4, v4, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v1 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3 -; GFX11-NEXT: v_pack_b32_f16 v0, v4, v2 +; GFX11-NEXT: v_pack_b32_f16 v1, v0, v2 +; GFX11-NEXT: v_pack_b32_f16 v0, v3, v4 ; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll index b4b0d960e12e5..b08a35ab80732 100644 --- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @sitofp_i16_to_f16( ; SI-LABEL: sitofp_i16_to_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @sitofp_i16_to_f16( ; ; VI-LABEL: sitofp_i16_to_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 
@@ -43,7 +43,7 @@ define amdgpu_kernel void @sitofp_i16_to_f16( ; ; GFX11-LABEL: sitofp_i16_to_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -72,7 +72,7 @@ entry: define amdgpu_kernel void @sitofp_i32_to_f16( ; SI-LABEL: sitofp_i32_to_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -91,7 +91,7 @@ define amdgpu_kernel void @sitofp_i32_to_f16( ; ; VI-LABEL: sitofp_i32_to_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -110,7 +110,7 @@ define amdgpu_kernel void @sitofp_i32_to_f16( ; ; GFX11-LABEL: sitofp_i32_to_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -143,7 +143,7 @@ entry: define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; SI-LABEL: sitofp_v2i16_to_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -168,7 +168,7 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; ; VI-LABEL: sitofp_v2i16_to_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -188,7 +188,7 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; ; GFX11-LABEL: 
sitofp_v2i16_to_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -221,7 +221,7 @@ entry: define amdgpu_kernel void @sitofp_v2i32_to_v2f16( ; SI-LABEL: sitofp_v2i32_to_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -244,7 +244,7 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16( ; ; VI-LABEL: sitofp_v2i32_to_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -266,7 +266,7 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16( ; ; GFX11-LABEL: sitofp_v2i32_to_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -301,19 +301,21 @@ entry: define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: s_sint_to_fp_i1_to_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, 
s3 -; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -321,26 +323,26 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; SI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[0:1] ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sint_to_fp_i1_to_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -348,16 +350,14 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[0:1] ; VI-NEXT: 
v_cvt_f16_f32_e32 v0, v0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_sint_to_fp_i1_to_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 diff --git a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll index 233f4cc4fee50..fbb9ba0b73846 100644 --- a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll +++ b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll @@ -2,10 +2,10 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX940 %s -define protected amdgpu_kernel void @test(ptr addrspace(1) %in, ptr addrspace(1) %out) { +define protected amdgpu_kernel void @test(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { ; GFX940-LABEL: test: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v2, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -51,3 +51,5 @@ entry: ret void } declare <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32>, <4 x i32>, <4 x i32>, i32, i32 immarg, i32 immarg) + +attributes #0 = { "amdgpu-no-agpr" } diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll index f8c9827ecf7a9..93e210bb4c809 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -106,7 +106,7 @@ define amdgpu_kernel void @v_abs_v2i16_2(ptr addrspace(1) %out, ptr addrspace(1) } ; GCN-LABEL: {{^}}s_abs_v4i16: -; GFX9: 
s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[0:1], 0x24 +; GFX9: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[2:3], 0x24 ; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, s[[#LOAD + 2]] ; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, s[[#LOAD + 3]] ; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], s[[#LOAD + 2]], [[SUB0]] diff --git a/llvm/test/CodeGen/AMDGPU/sopk-compares.ll b/llvm/test/CodeGen/AMDGPU/sopk-compares.ll index 8b166b4c1bf3f..c54832d778434 100644 --- a/llvm/test/CodeGen/AMDGPU/sopk-compares.ll +++ b/llvm/test/CodeGen/AMDGPU/sopk-compares.ll @@ -333,7 +333,7 @@ endif: } ; GCN-LABEL: {{^}}br_scc_ult_i32_min_simm16: -; GCN: s_cmp_lt_u32 s2, 0xffff8000 +; GCN: s_cmp_lt_u32 s{{[0-9]+}}, 0xffff8000 define amdgpu_kernel void @br_scc_ult_i32_min_simm16(i32 %cond, ptr addrspace(1) %out) #0 { entry: %cmp0 = icmp ult i32 %cond, -32768 @@ -552,7 +552,7 @@ endif: } ; GCN-LABEL: {{^}}br_scc_ult_i32_non_u16: -; GCN: s_cmp_lt_u32 s2, 0xfffff7ff +; GCN: s_cmp_lt_u32 s{{[0-9]+}}, 0xfffff7ff define amdgpu_kernel void @br_scc_ult_i32_non_u16(i32 %cond, ptr addrspace(1) %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll index c9413b61758d1..804fb8f258ffd 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -121,7 +121,7 @@ declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float> declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32) declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) -attributes #1 = { nounwind "amdgpu-num-vgpr"="10" } -attributes #2 = { nounwind "amdgpu-num-vgpr"="12" } -attributes #3 = { nounwind "amdgpu-num-vgpr"="32" } -attributes #4 = { nounwind "amdgpu-num-vgpr"="6" } +attributes #1 = { nounwind "amdgpu-num-vgpr"="10" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #2 = { nounwind "amdgpu-num-vgpr"="12" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #3 = { nounwind "amdgpu-num-vgpr"="32" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #4 = { nounwind "amdgpu-num-vgpr"="6" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll index baca66a287cbf..55238b284efce 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @test_inst_offset_kernel() { ; MUBUF-LABEL: test_inst_offset_kernel: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s7 +; MUBUF-NEXT: s_add_u32 s0, s0, s15 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -24,8 +24,8 @@ define amdgpu_kernel void @test_inst_offset_kernel() { ; ; FLATSCR-LABEL: test_inst_offset_kernel: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; 
FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:4 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -61,7 +61,7 @@ entry: define amdgpu_kernel void @test_sgpr_offset_kernel() { ; MUBUF-LABEL: test_sgpr_offset_kernel: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s7 +; MUBUF-NEXT: s_add_u32 s0, s0, s15 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -77,8 +77,8 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() { ; ; FLATSCR-LABEL: test_sgpr_offset_kernel: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:8 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -193,7 +193,7 @@ entry: define amdgpu_kernel void @test_sgpr_offset_function_scavenge_fail_kernel() #3 { ; MUBUF-LABEL: test_sgpr_offset_function_scavenge_fail_kernel: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s7 +; MUBUF-NEXT: s_add_u32 s0, s0, s15 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND @@ -215,8 +215,8 @@ define amdgpu_kernel void @test_sgpr_offset_function_scavenge_fail_kernel() #3 { ; ; FLATSCR-LABEL: test_sgpr_offset_function_scavenge_fail_kernel: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; FLATSCR-NEXT: s_mov_b32 s8, 0 ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ;;#ASMEND @@ -275,7 +275,7 @@ entry: define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() { ; 
MUBUF-LABEL: test_sgpr_offset_subregs_kernel: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s7 +; MUBUF-NEXT: s_add_u32 s0, s0, s15 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -298,8 +298,8 @@ define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() { ; ; FLATSCR-LABEL: test_sgpr_offset_subregs_kernel: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:8 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -341,7 +341,7 @@ entry: define amdgpu_kernel void @test_inst_offset_subregs_kernel() { ; MUBUF-LABEL: test_inst_offset_subregs_kernel: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s7 +; MUBUF-NEXT: s_add_u32 s0, s0, s15 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -365,8 +365,8 @@ define amdgpu_kernel void @test_inst_offset_subregs_kernel() { ; ; FLATSCR-LABEL: test_inst_offset_subregs_kernel: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:12 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index bea2e6d4b45a3..b9ad4615fcbcf 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -16,12 +16,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, 
ptr addrspace(1) %in) { ; GFX6-LABEL: test: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX6-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX6-NEXT: s_mov_b32 s42, -1 -; GFX6-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX6-NEXT: s_add_u32 s40, s40, s3 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 13, v0 @@ -34,6 +29,11 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v0, vcc ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 +; GFX6-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX6-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX6-NEXT: s_mov_b32 s42, -1 +; GFX6-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX6-NEXT: s_add_u32 s40, s40, s9 ; GFX6-NEXT: s_addc_u32 s41, s41, 0 ; GFX6-NEXT: s_mov_b32 s2, 0x3fd00 ; GFX6-NEXT: s_mov_b64 s[8:9], 0x100 @@ -4987,9 +4987,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; ; GFX9-FLATSCR-LABEL: test: ; GFX9-FLATSCR: ; %bb.0: ; %entry -; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5 -; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v5, 13, v0 @@ -5001,6 +4999,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968 +; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-FLATSCR-NEXT: s_addc_u32 
flat_scratch_hi, s7, 0 ; GFX9-FLATSCR-NEXT: s_mov_b32 s4, 4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill @@ -7613,11 +7613,11 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; ; GFX10-FLATSCR-LABEL: test: ; GFX10-FLATSCR: ; %bb.0: ; %entry -; GFX10-FLATSCR-NEXT: s_add_u32 s2, s2, s5 -; GFX10-FLATSCR-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FLATSCR-NEXT: s_add_u32 s6, s6, s11 +; GFX10-FLATSCR-NEXT: s_addc_u32 s7, s7, 0 +; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX10-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v5, 13, v0 @@ -10071,10 +10071,10 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 ; GFX6-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s42, -1 ; GFX6-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX6-NEXT: s_add_u32 s40, s40, s3 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_add_u32 s40, s40, s9 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; GFX6-NEXT: s_addc_u32 s41, s41, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, -1, v0 @@ -10646,14 +10646,14 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-FLATSCR-LABEL: test_limited_sgpr: ; GFX9-FLATSCR: ; %bb.0: ; %entry -; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 +; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 ; GFX9-FLATSCR-NEXT: 
v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v5, 8, v0 -; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:240 -; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2050 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 16 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -10830,11 +10830,11 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; ; GFX10-FLATSCR-LABEL: test_limited_sgpr: ; GFX10-FLATSCR: ; %bb.0: ; %entry -; GFX10-FLATSCR-NEXT: s_add_u32 s2, s2, s5 -; GFX10-FLATSCR-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-FLATSCR-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 +; GFX10-FLATSCR-NEXT: s_add_u32 s6, s6, s11 +; GFX10-FLATSCR-NEXT: s_addc_u32 s7, s7, 0 +; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-FLATSCR-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 ; GFX10-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 1 ; GFX10-FLATSCR-NEXT: s_mov_b32 s33, exec_lo diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll index f5e94df415ae4..5338bc8f7aa7a 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 
%s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX10 %s -; The test was originally written to spill an SGPR to scratch without having spare SGPRs -; available to save exec. This scenario won't be true anymore as we reseve SGPR(s) -; upfront for saving exec. +; The test was originally written to spill an SGPR to scratch without +; having spare SGPRs available to save exec. This scenario won't be +; true anymore as we reserve SGPR(s) upfront for saving exec. define amdgpu_kernel void @test() #1 { ; GFX10-LABEL: test: @@ -12,7 +12,7 @@ define amdgpu_kernel void @test() #1 { ; GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s10, -1 ; GFX10-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX10-NEXT: s_add_u32 s8, s8, s1 +; GFX10-NEXT: s_add_u32 s8, s8, s7 ; GFX10-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s[0:7] @@ -37,5 +37,8 @@ define amdgpu_kernel void @test() #1 { ret void } +; FIXME: amdgpu-no attributes are a workaround for cases where the +; number of incoming arguments is larger than the number of permitted +; registers.
attributes #0 = { nounwind } -attributes #1 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" } +attributes #1 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" "amdgpu-no-queue-ptr" "amdgpu-no-dispatch-id" } diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll index d5f97314f9324..b4a981f1db4ec 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll @@ -5,17 +5,17 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 { ; GCN-LABEL: name: test_spill_av_class ; GCN: bb.0 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr4_sgpr5 + ; GCN-NEXT: liveins: $sgpr6_sgpr7 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) + ; GCN-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr6_sgpr7, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec ; GCN-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %24.sub0 - ; GCN-NEXT: SI_SPILL_V64_SAVE %24, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) + ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %30.sub0 + ; GCN-NEXT: SI_SPILL_V64_SAVE %30, %stack.0, $sgpr32, 0, implicit $exec :: (store 
(s64) into %stack.0, align 4, addrspace 5) ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] - ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %16:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) + ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %22:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, [[SI_SPILL_V64_RESTORE]] ; GCN-NEXT: S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll index c1c69ce568a9c..bc13b8d033017 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll @@ -44,7 +44,7 @@ define void @device_writelane_intrinsic(ptr addrspace(1) %out, i32 %src) { define amdgpu_kernel void @kernel_writelane_intrinsic(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GCN-LABEL: kernel_writelane_intrinsic: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: v_mov_b32_e32 v1, 45 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll index b8cf692372069..cd06a060a50cd 100644 --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @ashr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: ashr_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: 
s_mov_b32 s10, s6 @@ -27,7 +27,7 @@ define amdgpu_kernel void @ashr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -72,7 +72,7 @@ define amdgpu_kernel void @ashr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @ashr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: ashr_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -94,7 +94,7 @@ define amdgpu_kernel void @ashr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -147,7 +147,7 @@ define amdgpu_kernel void @ashr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: ashr_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -175,7 +175,7 @@ define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -243,7 +243,7 @@ define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; 
SI-LABEL: ashr_v4i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -282,7 +282,7 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -395,8 +395,8 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @s_ashr_i64(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_ashr_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -409,8 +409,8 @@ define amdgpu_kernel void @s_ashr_i64(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: s_ashr_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -443,7 +443,7 @@ entry: define amdgpu_kernel void @ashr_i64_2(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: ashr_i64_2: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -461,7 +461,7 @@ define amdgpu_kernel void @ashr_i64_2(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_i64_2: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; 
VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -513,7 +513,7 @@ entry: define amdgpu_kernel void @ashr_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: ashr_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -533,7 +533,7 @@ define amdgpu_kernel void @ashr_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -597,7 +597,7 @@ define amdgpu_kernel void @ashr_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: ashr_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -623,7 +623,7 @@ define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s10, s2 @@ -714,9 +714,9 @@ define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @s_ashr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { ; SI-LABEL: s_ashr_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s6, s[0:1], 0x14 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x1d -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s6, s[2:3], 0x14 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 
0x1d +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -730,9 +730,9 @@ define amdgpu_kernel void @s_ashr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 % ; ; VI-LABEL: s_ashr_32_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[0:1], 0x50 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[2:3], 0x50 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x74 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -768,7 +768,7 @@ define amdgpu_kernel void @s_ashr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 % define amdgpu_kernel void @v_ashr_32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_ashr_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -785,7 +785,7 @@ define amdgpu_kernel void @v_ashr_32_i64(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_ashr_32_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -833,9 +833,9 @@ define amdgpu_kernel void @v_ashr_32_i64(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @s_ashr_63_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { ; SI-LABEL: s_ashr_63_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s6, s[0:1], 0x14 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x1d -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s6, s[2:3], 0x14 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x1d +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 
s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -849,9 +849,9 @@ define amdgpu_kernel void @s_ashr_63_i64(ptr addrspace(1) %out, [8 x i32], i64 % ; ; VI-LABEL: s_ashr_63_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[0:1], 0x50 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[2:3], 0x50 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x74 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -887,7 +887,7 @@ define amdgpu_kernel void @s_ashr_63_i64(ptr addrspace(1) %out, [8 x i32], i64 % define amdgpu_kernel void @v_ashr_63_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_ashr_63_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -905,7 +905,7 @@ define amdgpu_kernel void @v_ashr_63_i64(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_ashr_63_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index bcc67e974ae4a..abf013e39eefa 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i16_7: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_ushort v1, v0, s[2:3] @@ -25,7 +25,7 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TAHITI-LABEL: srem_i16_7: 
; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, s6 @@ -49,7 +49,7 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_i16_7: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -113,7 +113,7 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -149,7 +149,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; TAHITI-LABEL: srem_i32: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, s6 @@ -192,7 +192,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; TONGA-LABEL: srem_i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -277,7 +277,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @srem_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; 
GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dword v1, v0, s[2:3] @@ -292,7 +292,7 @@ define amdgpu_kernel void @srem_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TAHITI-LABEL: srem_i32_4: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, s6 @@ -314,7 +314,7 @@ define amdgpu_kernel void @srem_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -363,7 +363,7 @@ define amdgpu_kernel void @srem_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_i32_7(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i32_7: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dword v1, v0, s[2:3] @@ -381,7 +381,7 @@ define amdgpu_kernel void @srem_i32_7(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TAHITI-LABEL: srem_i32_7: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, s6 @@ -406,7 +406,7 @@ define amdgpu_kernel void @srem_i32_7(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_i32_7: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 
v1, s3 @@ -459,7 +459,7 @@ define amdgpu_kernel void @srem_i32_7(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -521,7 +521,7 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TAHITI-LABEL: srem_v2i32: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; TAHITI-NEXT: s_mov_b32 s3, 0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: s_mov_b32 s10, s2 @@ -590,7 +590,7 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_v2i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -723,7 +723,7 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v2i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -747,7 +747,7 @@ define amdgpu_kernel void @srem_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TAHITI-LABEL: srem_v2i32_4: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, s6 @@ -778,7 +778,7 @@ define amdgpu_kernel 
void @srem_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: srem_v2i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -842,7 +842,7 @@ define amdgpu_kernel void @srem_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] offset:16 @@ -958,7 +958,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TAHITI-LABEL: srem_v4i32: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; TAHITI-NEXT: s_mov_b32 s3, 0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: s_mov_b32 s10, s2 @@ -1081,7 +1081,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_v4i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: s_add_u32 s4, s2, 16 ; TONGA-NEXT: s_addc_u32 s5, s3, 0 @@ -1317,7 +1317,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v4i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -1355,7 +1355,7 @@ define amdgpu_kernel void @srem_v4i32_4(ptr 
addrspace(1) %out, ptr addrspace(1) ; ; TAHITI-LABEL: srem_v4i32_4: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; TAHITI-NEXT: s_mov_b32 s3, 0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: s_mov_b32 s10, s2 @@ -1400,7 +1400,7 @@ define amdgpu_kernel void @srem_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: srem_v4i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -1491,7 +1491,7 @@ define amdgpu_kernel void @srem_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] @@ -1675,7 +1675,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; TAHITI-LABEL: srem_i64: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; TAHITI-NEXT: s_mov_b32 s3, 0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: v_mov_b32_e32 v4, 0 @@ -1836,7 +1836,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; TONGA-LABEL: srem_i64: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; TONGA-NEXT: v_mov_b32_e32 v4, 0 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s6 @@ -2589,7 +2589,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @srem_i64_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: 
srem_i64_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -2606,7 +2606,7 @@ define amdgpu_kernel void @srem_i64_4(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TAHITI-LABEL: srem_i64_4: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, s6 @@ -2630,7 +2630,7 @@ define amdgpu_kernel void @srem_i64_4(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_i64_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -2684,7 +2684,7 @@ define amdgpu_kernel void @srem_i64_4(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v2i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] offset:16 @@ -3039,7 +3039,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TAHITI-LABEL: srem_v2i64: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; TAHITI-NEXT: s_mov_b32 s3, 0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: v_mov_b32_e32 v8, 0 @@ -3346,7 +3346,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_v2i64: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; TONGA-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x24 ; TONGA-NEXT: v_mov_b32_e32 v8, 0 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: s_add_u32 s0, s6, 16 @@ -4733,7 +4733,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v2i64_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v2i64_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -4757,7 +4757,7 @@ define amdgpu_kernel void @srem_v2i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TAHITI-LABEL: srem_v2i64_4: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, s6 @@ -4788,7 +4788,7 @@ define amdgpu_kernel void @srem_v2i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: srem_v2i64_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -4860,7 +4860,7 @@ define amdgpu_kernel void @srem_v2i64_4(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v4i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[10:13], v8, s[6:7] offset:32 @@ -5486,7 +5486,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TAHITI-LABEL: srem_v4i64: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; 
TAHITI-NEXT: s_mov_b32 s3, 0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: v_mov_b32_e32 v8, 0 @@ -6088,7 +6088,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_v4i64: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; TONGA-NEXT: v_mov_b32_e32 v8, 0 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: s_add_u32 s0, s6, 48 @@ -8883,7 +8883,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v4i64_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] @@ -8924,7 +8924,7 @@ define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TAHITI-LABEL: srem_v4i64_4: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; TAHITI-NEXT: s_mov_b32 s3, 0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: s_mov_b32 s10, s2 @@ -8972,7 +8972,7 @@ define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: srem_v4i64_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 93fab7dff253b..8498e9af46f2b 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -5,8 +5,8 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 
s[12:13], s[0:1], 0xd -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd +; GCN-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -122,8 +122,8 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; ; GCN-IR-LABEL: s_test_srem: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -442,8 +442,8 @@ define i64 @v_test_srem(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_srem23_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem23_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s1, s[2:3], 0xe ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -477,8 +477,8 @@ define amdgpu_kernel void @s_test_srem23_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem23_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -519,8 +519,8 @@ define amdgpu_kernel void @s_test_srem23_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem24_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem24_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; 
GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s1, s[2:3], 0xe ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -554,8 +554,8 @@ define amdgpu_kernel void @s_test_srem24_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem24_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -650,14 +650,14 @@ define i64 @v_test_srem24_64(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem25_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s3, s[0:1], 0xe +; GCN-NEXT: s_load_dword s1, s[2:3], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 -; GCN-NEXT: s_abs_i32 s8, s2 +; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 39 +; GCN-NEXT: s_abs_i32 s8, s0 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -691,14 +691,14 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem25_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 -; GCN-IR-NEXT: s_abs_i32 s8, s2 +; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 39 +; GCN-IR-NEXT: s_abs_i32 s8, s0 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 
+; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -739,14 +739,14 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem31_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s3, s[0:1], 0xe +; GCN-NEXT: s_load_dword s1, s[2:3], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 -; GCN-NEXT: s_abs_i32 s8, s2 +; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 33 +; GCN-NEXT: s_abs_i32 s8, s0 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -780,14 +780,14 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem31_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 -; GCN-IR-NEXT: s_abs_i32 s8, s2 +; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 33 +; GCN-IR-NEXT: s_abs_i32 s8, s0 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -829,18 +829,18 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem32_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[0:1], 
0xe +; GCN-NEXT: s_load_dword s0, s[2:3], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_abs_i32 s8, s2 +; GCN-NEXT: s_abs_i32 s8, s0 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-NEXT: s_sub_i32 s2, 0, s8 +; GCN-NEXT: s_sub_i32 s0, 0, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_abs_i32 s2, s3 @@ -868,18 +868,18 @@ define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem32_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dword s0, s[2:3], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_abs_i32 s8, s2 +; GCN-IR-NEXT: s_abs_i32 s8, s0 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 +; GCN-IR-NEXT: s_sub_i32 s0, 0, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v0 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_abs_i32 s2, s3 @@ -915,8 +915,8 @@ define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem33_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1049,8 +1049,8 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem33_64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN-IR-NEXT: s_mov_b32 s13, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[6:7], 31 @@ -1153,8 +1153,8 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 %y) { ; GCN-LABEL: s_test_srem24_48: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1189,22 +1189,22 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 % ; ; GCN-IR-LABEL: s_test_srem24_48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb ; GCN-IR-NEXT: s_mov_b32 s13, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sext_i32_i16 s5, s5 ; GCN-IR-NEXT: s_sext_i32_i16 s7, s7 -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[4:5], 24 +; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[4:5], 24 ; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 24 -; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 +; GCN-IR-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[4:5], 16 -; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[2:3], 16 -; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31 -; GCN-IR-NEXT: s_mov_b32 s3, 
s2 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[0:1], 16 +; GCN-IR-NEXT: s_ashr_i32 s0, s1, 31 +; GCN-IR-NEXT: s_mov_b32 s1, s0 ; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[6:7], 16 -; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] -; GCN-IR-NEXT: s_sub_u32 s4, s4, s2 -; GCN-IR-NEXT: s_subb_u32 s5, s5, s2 +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1] +; GCN-IR-NEXT: s_sub_u32 s4, s4, s0 +; GCN-IR-NEXT: s_subb_u32 s5, s5, s0 ; GCN-IR-NEXT: s_ashr_i32 s10, s7, 31 ; GCN-IR-NEXT: s_mov_b32 s11, s10 ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[10:11] @@ -1271,20 +1271,20 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-NEXT: .LBB9_5: ; %udiv-end ; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 ; GCN-IR-NEXT: v_mul_hi_u32 v0, s6, v0 -; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 -; GCN-IR-NEXT: s_mul_i32 s0, s6, s11 +; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 +; GCN-IR-NEXT: s_mul_i32 s2, s6, s11 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s5 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GCN-IR-NEXT: s_mul_i32 s0, s7, s10 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GCN-IR-NEXT: s_mul_i32 s0, s6, s10 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; GCN-IR-NEXT: s_mul_i32 s2, s7, s10 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; GCN-IR-NEXT: s_mul_i32 s2, s6, s10 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s2 ; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 ; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, v2, v0, vcc -; GCN-IR-NEXT: v_xor_b32_e32 v1, s2, v1 -; GCN-IR-NEXT: v_xor_b32_e32 v0, s3, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 -; GCN-IR-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v1, s0, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v0, s1, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s1 +; GCN-IR-NEXT: v_subrev_i32_e32 v1, vcc, s0, v1 ; GCN-IR-NEXT: s_mov_b32 s15, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s14, -1 ; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, v0, v2, vcc @@ -1302,7 +1302,7 @@ define amdgpu_kernel void 
@s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 % define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_srem_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1411,7 +1411,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; ; GCN-IR-LABEL: s_test_srem_k_num_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i32 s8, s3, 31 @@ -1984,7 +1984,7 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) { define amdgpu_kernel void @s_test_srem24_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_srem24_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -2016,7 +2016,7 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_srem24_k_num_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -2054,7 +2054,7 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(ptr addrspace(1) %out, i64 %x define amdgpu_kernel void @s_test_srem24_k_den_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_srem24_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s8, 0x46b6fe00 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -2085,7 
+2085,7 @@ define amdgpu_kernel void @s_test_srem24_k_den_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_srem24_k_den_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: s_mov_b32 s8, 0x46b6fe00 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll index 418c160d4244a..03d1dddd7b606 100644 --- a/llvm/test/CodeGen/AMDGPU/srl.ll +++ b/llvm/test/CodeGen/AMDGPU/srl.ll @@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @lshr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: lshr_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -26,7 +26,7 @@ define amdgpu_kernel void @lshr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: lshr_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -64,7 +64,7 @@ define amdgpu_kernel void @lshr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @lshr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: lshr_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -83,7 +83,7 @@ define amdgpu_kernel void @lshr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: lshr_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: 
s_mov_b32 s3, 0xf000 @@ -124,7 +124,7 @@ define amdgpu_kernel void @lshr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @lshr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: lshr_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -146,7 +146,7 @@ define amdgpu_kernel void @lshr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: lshr_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; VI-NEXT: s_mov_b32 s11, 0xf000 @@ -194,7 +194,7 @@ define amdgpu_kernel void @lshr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @lshr_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: lshr_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -212,7 +212,7 @@ define amdgpu_kernel void @lshr_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: lshr_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -258,7 +258,7 @@ define amdgpu_kernel void @lshr_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @lshr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: lshr_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -284,7 +284,7 @@ define amdgpu_kernel void 
@lshr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: lshr_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; VI-NEXT: s_mov_b32 s19, 0xf000 @@ -370,8 +370,8 @@ define amdgpu_kernel void @lshr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @s_lshr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: s_lshr_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0x14 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0x14 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0 @@ -382,8 +382,8 @@ define amdgpu_kernel void @s_lshr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 % ; ; VI-LABEL: s_lshr_32_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x50 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x50 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 @@ -411,7 +411,7 @@ define amdgpu_kernel void @s_lshr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 % define amdgpu_kernel void @v_lshr_32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_lshr_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -428,7 +428,7 @@ define amdgpu_kernel void @v_lshr_32_i64(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_lshr_32_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 diff 
--git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll index 9ad9fa0304865..132775d81ca1a 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -10,12 +10,12 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr addrspace(1) %input, ptr addrspace(1) %output, i32 %i) { ; MUBUF-LABEL: kernel_background_evaluate: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_load_dword s0, s[0:1], 0x24 +; MUBUF-NEXT: s_load_dword s0, s[2:3], 0x24 ; MUBUF-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; MUBUF-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; MUBUF-NEXT: s_mov_b32 s38, -1 ; MUBUF-NEXT: s_mov_b32 s39, 0x31c16000 -; MUBUF-NEXT: s_add_u32 s36, s36, s3 +; MUBUF-NEXT: s_add_u32 s36, s36, s9 ; MUBUF-NEXT: s_addc_u32 s37, s37, 0 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x2000 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 @@ -48,12 +48,12 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr ; ; FLATSCR-LABEL: kernel_background_evaluate: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 s2, s2, s5 +; FLATSCR-NEXT: s_add_u32 s6, s6, s11 ; FLATSCR-NEXT: s_movk_i32 s32, 0x6000 -; FLATSCR-NEXT: s_addc_u32 s3, s3, 0 -; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; FLATSCR-NEXT: s_load_dword s2, s[0:1], 0x24 +; FLATSCR-NEXT: s_addc_u32 s7, s7, 0 +; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; FLATSCR-NEXT: s_load_dword s2, s[2:3], 0x24 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0x2000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0x4000 ; FLATSCR-NEXT: v_mov_b32_e32 v3, 0 @@ -81,7 +81,7 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr ; ; MUBUF11-LABEL: kernel_background_evaluate: ; MUBUF11: ; 
%bb.0: ; %entry -; MUBUF11-NEXT: s_load_b32 s2, s[0:1], 0x24 +; MUBUF11-NEXT: s_load_b32 s2, s[2:3], 0x24 ; MUBUF11-NEXT: v_mov_b32_e32 v1, 0x2000 ; MUBUF11-NEXT: v_dual_mov_b32 v2, 0x4000 :: v_dual_mov_b32 v3, 0 ; MUBUF11-NEXT: v_mov_b32_e32 v4, 0x400000 @@ -108,7 +108,7 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr ; ; FLATSCR11-LABEL: kernel_background_evaluate: ; FLATSCR11: ; %bb.0: ; %entry -; FLATSCR11-NEXT: s_load_b32 s2, s[0:1], 0x24 +; FLATSCR11-NEXT: s_load_b32 s2, s[2:3], 0x24 ; FLATSCR11-NEXT: v_mov_b32_e32 v1, 0x2000 ; FLATSCR11-NEXT: v_dual_mov_b32 v2, 0x4000 :: v_dual_mov_b32 v3, 0 ; FLATSCR11-NEXT: v_mov_b32_e32 v4, 0x400000 diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll index 5c6f0019f1ed9..6ddf0986755f9 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @max_alignment_128() #0 { ; VI-LABEL: max_alignment_128: ; VI: ; %bb.0: -; VI-NEXT: s_add_u32 s0, s0, s7 +; VI-NEXT: s_add_u32 s0, s0, s17 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, 3 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -20,23 +20,23 @@ define amdgpu_kernel void @max_alignment_128() #0 { ; VI-NEXT: .amdhsa_kernel max_alignment_128 ; VI-NEXT: .amdhsa_group_segment_fixed_size 0 ; VI-NEXT: .amdhsa_private_segment_fixed_size 256 -; VI-NEXT: .amdhsa_kernarg_size 0 -; VI-NEXT: .amdhsa_user_sgpr_count 6 +; VI-NEXT: .amdhsa_kernarg_size 56 +; VI-NEXT: .amdhsa_user_sgpr_count 14 ; VI-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 -; VI-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 -; VI-NEXT: .amdhsa_user_sgpr_queue_ptr 0 -; VI-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 -; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +; VI-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 +; VI-NEXT: .amdhsa_user_sgpr_queue_ptr 1 +; VI-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 +; VI-NEXT: 
.amdhsa_user_sgpr_dispatch_id 1 ; VI-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 ; VI-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 -; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 -; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 +; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 +; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 ; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0 -; VI-NEXT: .amdhsa_system_vgpr_workitem_id 0 +; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2 ; VI-NEXT: .amdhsa_next_free_vgpr 1 -; VI-NEXT: .amdhsa_next_free_sgpr 8 +; VI-NEXT: .amdhsa_next_free_sgpr 18 ; VI-NEXT: .amdhsa_reserve_vcc 0 ; VI-NEXT: .amdhsa_reserve_flat_scratch 0 ; VI-NEXT: .amdhsa_float_round_mode_32 0 @@ -57,7 +57,7 @@ define amdgpu_kernel void @max_alignment_128() #0 { ; ; GFX9-LABEL: max_alignment_128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, s7 +; GFX9-NEXT: s_add_u32 s0, s0, s17 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -71,23 +71,23 @@ define amdgpu_kernel void @max_alignment_128() #0 { ; GFX9-NEXT: .amdhsa_kernel max_alignment_128 ; GFX9-NEXT: .amdhsa_group_segment_fixed_size 0 ; GFX9-NEXT: .amdhsa_private_segment_fixed_size 256 -; GFX9-NEXT: .amdhsa_kernarg_size 0 -; GFX9-NEXT: .amdhsa_user_sgpr_count 6 +; GFX9-NEXT: .amdhsa_kernarg_size 56 +; GFX9-NEXT: .amdhsa_user_sgpr_count 14 ; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 -; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 -; GFX9-NEXT: .amdhsa_user_sgpr_queue_ptr 0 -; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 -; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 +; GFX9-NEXT: .amdhsa_user_sgpr_queue_ptr 1 +; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 +; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 1 ; GFX9-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 ; GFX9-NEXT: 
.amdhsa_user_sgpr_private_segment_size 0 ; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 -; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 -; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 +; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 +; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0 -; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 0 +; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2 ; GFX9-NEXT: .amdhsa_next_free_vgpr 1 -; GFX9-NEXT: .amdhsa_next_free_sgpr 8 +; GFX9-NEXT: .amdhsa_next_free_sgpr 18 ; GFX9-NEXT: .amdhsa_reserve_vcc 0 ; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0 ; GFX9-NEXT: .amdhsa_reserve_xnack_mask 1 @@ -117,7 +117,7 @@ define amdgpu_kernel void @max_alignment_128() #0 { define amdgpu_kernel void @stackrealign_attr() #1 { ; VI-LABEL: stackrealign_attr: ; VI: ; %bb.0: -; VI-NEXT: s_add_u32 s0, s0, s7 +; VI-NEXT: s_add_u32 s0, s0, s17 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, 3 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -131,23 +131,23 @@ define amdgpu_kernel void @stackrealign_attr() #1 { ; VI-NEXT: .amdhsa_kernel stackrealign_attr ; VI-NEXT: .amdhsa_group_segment_fixed_size 0 ; VI-NEXT: .amdhsa_private_segment_fixed_size 12 -; VI-NEXT: .amdhsa_kernarg_size 0 -; VI-NEXT: .amdhsa_user_sgpr_count 6 +; VI-NEXT: .amdhsa_kernarg_size 56 +; VI-NEXT: .amdhsa_user_sgpr_count 14 ; VI-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 -; VI-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 -; VI-NEXT: .amdhsa_user_sgpr_queue_ptr 0 -; VI-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 -; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +; VI-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 +; VI-NEXT: .amdhsa_user_sgpr_queue_ptr 1 +; VI-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 +; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 1 ; VI-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 ; VI-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; VI-NEXT: 
.amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 -; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 -; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 +; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 +; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 ; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0 -; VI-NEXT: .amdhsa_system_vgpr_workitem_id 0 +; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2 ; VI-NEXT: .amdhsa_next_free_vgpr 1 -; VI-NEXT: .amdhsa_next_free_sgpr 8 +; VI-NEXT: .amdhsa_next_free_sgpr 18 ; VI-NEXT: .amdhsa_reserve_vcc 0 ; VI-NEXT: .amdhsa_reserve_flat_scratch 0 ; VI-NEXT: .amdhsa_float_round_mode_32 0 @@ -168,7 +168,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 { ; ; GFX9-LABEL: stackrealign_attr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, s7 +; GFX9-NEXT: s_add_u32 s0, s0, s17 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -182,23 +182,23 @@ define amdgpu_kernel void @stackrealign_attr() #1 { ; GFX9-NEXT: .amdhsa_kernel stackrealign_attr ; GFX9-NEXT: .amdhsa_group_segment_fixed_size 0 ; GFX9-NEXT: .amdhsa_private_segment_fixed_size 12 -; GFX9-NEXT: .amdhsa_kernarg_size 0 -; GFX9-NEXT: .amdhsa_user_sgpr_count 6 +; GFX9-NEXT: .amdhsa_kernarg_size 56 +; GFX9-NEXT: .amdhsa_user_sgpr_count 14 ; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 -; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 -; GFX9-NEXT: .amdhsa_user_sgpr_queue_ptr 0 -; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 -; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 +; GFX9-NEXT: .amdhsa_user_sgpr_queue_ptr 1 +; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 +; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 1 ; GFX9-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 ; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; GFX9-NEXT: 
.amdhsa_system_sgpr_workgroup_id_x 1 -; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 -; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 +; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 +; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0 -; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 0 +; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2 ; GFX9-NEXT: .amdhsa_next_free_vgpr 1 -; GFX9-NEXT: .amdhsa_next_free_sgpr 8 +; GFX9-NEXT: .amdhsa_next_free_sgpr 18 ; GFX9-NEXT: .amdhsa_reserve_vcc 0 ; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0 ; GFX9-NEXT: .amdhsa_reserve_xnack_mask 1 @@ -228,7 +228,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 { define amdgpu_kernel void @alignstack_attr() #2 { ; VI-LABEL: alignstack_attr: ; VI: ; %bb.0: -; VI-NEXT: s_add_u32 s0, s0, s7 +; VI-NEXT: s_add_u32 s0, s0, s17 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, 3 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -242,23 +242,23 @@ define amdgpu_kernel void @alignstack_attr() #2 { ; VI-NEXT: .amdhsa_kernel alignstack_attr ; VI-NEXT: .amdhsa_group_segment_fixed_size 0 ; VI-NEXT: .amdhsa_private_segment_fixed_size 128 -; VI-NEXT: .amdhsa_kernarg_size 0 -; VI-NEXT: .amdhsa_user_sgpr_count 6 +; VI-NEXT: .amdhsa_kernarg_size 56 +; VI-NEXT: .amdhsa_user_sgpr_count 14 ; VI-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 -; VI-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 -; VI-NEXT: .amdhsa_user_sgpr_queue_ptr 0 -; VI-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 -; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +; VI-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 +; VI-NEXT: .amdhsa_user_sgpr_queue_ptr 1 +; VI-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 +; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 1 ; VI-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 ; VI-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 -; VI-NEXT: 
.amdhsa_system_sgpr_workgroup_id_y 0 -; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 +; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 +; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 ; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0 -; VI-NEXT: .amdhsa_system_vgpr_workitem_id 0 +; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2 ; VI-NEXT: .amdhsa_next_free_vgpr 1 -; VI-NEXT: .amdhsa_next_free_sgpr 8 +; VI-NEXT: .amdhsa_next_free_sgpr 18 ; VI-NEXT: .amdhsa_reserve_vcc 0 ; VI-NEXT: .amdhsa_reserve_flat_scratch 0 ; VI-NEXT: .amdhsa_float_round_mode_32 0 @@ -279,7 +279,7 @@ define amdgpu_kernel void @alignstack_attr() #2 { ; ; GFX9-LABEL: alignstack_attr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, s7 +; GFX9-NEXT: s_add_u32 s0, s0, s17 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -293,23 +293,23 @@ define amdgpu_kernel void @alignstack_attr() #2 { ; GFX9-NEXT: .amdhsa_kernel alignstack_attr ; GFX9-NEXT: .amdhsa_group_segment_fixed_size 0 ; GFX9-NEXT: .amdhsa_private_segment_fixed_size 128 -; GFX9-NEXT: .amdhsa_kernarg_size 0 -; GFX9-NEXT: .amdhsa_user_sgpr_count 6 +; GFX9-NEXT: .amdhsa_kernarg_size 56 +; GFX9-NEXT: .amdhsa_user_sgpr_count 14 ; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 -; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 -; GFX9-NEXT: .amdhsa_user_sgpr_queue_ptr 0 -; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 -; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 +; GFX9-NEXT: .amdhsa_user_sgpr_queue_ptr 1 +; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 +; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 1 ; GFX9-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 ; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 -; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 -; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 +; 
GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 +; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0 -; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 0 +; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2 ; GFX9-NEXT: .amdhsa_next_free_vgpr 1 -; GFX9-NEXT: .amdhsa_next_free_sgpr 8 +; GFX9-NEXT: .amdhsa_next_free_sgpr 18 ; GFX9-NEXT: .amdhsa_reserve_vcc 0 ; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0 ; GFX9-NEXT: .amdhsa_reserve_xnack_mask 1 diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll index c6a599094fe43..3c16cd29de8f6 100644 --- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll +++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll @@ -121,31 +121,31 @@ define amdgpu_kernel void @kernel_store_stacksave() { define amdgpu_kernel void @kernel_store_stacksave_nocall() { ; WAVE32-OPT-LABEL: kernel_store_stacksave_nocall: ; WAVE32-OPT: ; %bb.0: -; WAVE32-OPT-NEXT: s_getpc_b64 s[4:5] -; WAVE32-OPT-NEXT: s_mov_b32 s4, s0 +; WAVE32-OPT-NEXT: s_getpc_b64 s[12:13] +; WAVE32-OPT-NEXT: s_mov_b32 s12, s0 ; WAVE32-OPT-NEXT: v_mov_b32_e32 v0, 0 -; WAVE32-OPT-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; WAVE32-OPT-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 ; WAVE32-OPT-NEXT: s_waitcnt lgkmcnt(0) -; WAVE32-OPT-NEXT: s_bitset0_b32 s7, 21 -; WAVE32-OPT-NEXT: s_add_u32 s4, s4, s1 -; WAVE32-OPT-NEXT: s_addc_u32 s5, s5, 0 +; WAVE32-OPT-NEXT: s_bitset0_b32 s15, 21 +; WAVE32-OPT-NEXT: s_add_u32 s12, s12, s9 +; WAVE32-OPT-NEXT: s_addc_u32 s13, s13, 0 ; WAVE32-OPT-NEXT: s_lshr_b32 s0, s32, 5 ; WAVE32-OPT-NEXT: v_mov_b32_e32 v1, s0 -; WAVE32-OPT-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; WAVE32-OPT-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; WAVE32-OPT-NEXT: s_endpgm ; ; WAVE64-OPT-LABEL: kernel_store_stacksave_nocall: ; WAVE64-OPT: ; %bb.0: -; WAVE64-OPT-NEXT: s_getpc_b64 s[4:5] -; WAVE64-OPT-NEXT: s_mov_b32 s4, s0 +; WAVE64-OPT-NEXT: s_getpc_b64 s[12:13] +; 
WAVE64-OPT-NEXT: s_mov_b32 s12, s0 ; WAVE64-OPT-NEXT: v_mov_b32_e32 v0, 0 -; WAVE64-OPT-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; WAVE64-OPT-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 ; WAVE64-OPT-NEXT: s_waitcnt lgkmcnt(0) -; WAVE64-OPT-NEXT: s_add_u32 s4, s4, s1 -; WAVE64-OPT-NEXT: s_addc_u32 s5, s5, 0 +; WAVE64-OPT-NEXT: s_add_u32 s12, s12, s9 +; WAVE64-OPT-NEXT: s_addc_u32 s13, s13, 0 ; WAVE64-OPT-NEXT: s_lshr_b32 s0, s32, 6 ; WAVE64-OPT-NEXT: v_mov_b32_e32 v1, s0 -; WAVE64-OPT-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; WAVE64-OPT-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; WAVE64-OPT-NEXT: s_endpgm ; ; WAVE32-O0-LABEL: kernel_store_stacksave_nocall: @@ -803,7 +803,7 @@ define amdgpu_gfx void @func_stacksave_sgpr(ptr addrspace(5) inreg %stack) { define amdgpu_kernel void @kernel_stacksave_sgpr(ptr addrspace(5) %stack) { ; WAVE32-OPT-LABEL: kernel_stacksave_sgpr: ; WAVE32-OPT: ; %bb.0: -; WAVE32-OPT-NEXT: s_load_dword s0, s[0:1], 0x0 +; WAVE32-OPT-NEXT: s_load_dword s0, s[2:3], 0x0 ; WAVE32-OPT-NEXT: s_waitcnt lgkmcnt(0) ; WAVE32-OPT-NEXT: ;;#ASMSTART ; WAVE32-OPT-NEXT: ; use s0 @@ -812,7 +812,7 @@ define amdgpu_kernel void @kernel_stacksave_sgpr(ptr addrspace(5) %stack) { ; ; WAVE64-OPT-LABEL: kernel_stacksave_sgpr: ; WAVE64-OPT: ; %bb.0: -; WAVE64-OPT-NEXT: s_load_dword s0, s[0:1], 0x0 +; WAVE64-OPT-NEXT: s_load_dword s0, s[2:3], 0x0 ; WAVE64-OPT-NEXT: s_waitcnt lgkmcnt(0) ; WAVE64-OPT-NEXT: ;;#ASMSTART ; WAVE64-OPT-NEXT: ; use s0 @@ -862,54 +862,72 @@ define amdgpu_kernel void @kernel_stacksave_sgpr(ptr addrspace(5) %stack) { define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-OPT-LABEL: kernel_stacksave_stackrestore_call_with_stack_objects: ; WAVE32-OPT: ; %bb.0: -; WAVE32-OPT-NEXT: s_getpc_b64 s[8:9] -; WAVE32-OPT-NEXT: s_mov_b32 s8, s0 +; WAVE32-OPT-NEXT: s_getpc_b64 s[20:21] +; WAVE32-OPT-NEXT: s_mov_b32 s20, s0 +; WAVE32-OPT-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; WAVE32-OPT-NEXT: 
s_load_dwordx4 s[20:23], s[20:21], 0x0 +; WAVE32-OPT-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; WAVE32-OPT-NEXT: s_movk_i32 s32, 0x1200 -; WAVE32-OPT-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 -; WAVE32-OPT-NEXT: s_mov_b32 s0, s32 -; WAVE32-OPT-NEXT: v_mov_b32_e32 v0, 42 -; WAVE32-OPT-NEXT: v_mov_b32_e32 v1, 17 -; WAVE32-OPT-NEXT: s_mov_b32 s5, stack_passed_argument@abs32@hi -; WAVE32-OPT-NEXT: s_mov_b32 s4, stack_passed_argument@abs32@lo +; WAVE32-OPT-NEXT: s_mov_b64 s[10:11], s[4:5] +; WAVE32-OPT-NEXT: s_mov_b32 s4, s32 +; WAVE32-OPT-NEXT: v_mov_b32_e32 v3, 42 +; WAVE32-OPT-NEXT: v_mov_b32_e32 v4, 17 +; WAVE32-OPT-NEXT: v_or3_b32 v31, v0, v1, v2 +; WAVE32-OPT-NEXT: s_mov_b32 s14, s8 +; WAVE32-OPT-NEXT: s_mov_b32 s17, stack_passed_argument@abs32@hi +; WAVE32-OPT-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo +; WAVE32-OPT-NEXT: s_mov_b32 s12, s6 +; WAVE32-OPT-NEXT: s_mov_b32 s13, s7 ; WAVE32-OPT-NEXT: s_waitcnt lgkmcnt(0) -; WAVE32-OPT-NEXT: s_bitset0_b32 s11, 21 -; WAVE32-OPT-NEXT: s_add_u32 s8, s8, s1 -; WAVE32-OPT-NEXT: s_addc_u32 s9, s9, 0 -; WAVE32-OPT-NEXT: s_lshr_b32 s6, s0, 5 -; WAVE32-OPT-NEXT: s_mov_b64 s[0:1], s[8:9] -; WAVE32-OPT-NEXT: s_mov_b64 s[2:3], s[10:11] -; WAVE32-OPT-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; WAVE32-OPT-NEXT: s_bitset0_b32 s23, 21 +; WAVE32-OPT-NEXT: s_add_u32 s20, s20, s9 +; WAVE32-OPT-NEXT: s_addc_u32 s21, s21, 0 +; WAVE32-OPT-NEXT: s_lshr_b32 s15, s4, 5 +; WAVE32-OPT-NEXT: s_mov_b64 s[4:5], s[0:1] +; WAVE32-OPT-NEXT: s_mov_b64 s[8:9], s[2:3] +; WAVE32-OPT-NEXT: s_mov_b64 s[0:1], s[20:21] +; WAVE32-OPT-NEXT: s_mov_b64 s[2:3], s[22:23] +; WAVE32-OPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 ; WAVE32-OPT-NEXT: s_waitcnt_vscnt null, 0x0 -; WAVE32-OPT-NEXT: buffer_store_dword v1, off, s[8:11], s32 offset:4 -; WAVE32-OPT-NEXT: s_swappc_b64 s[30:31], s[4:5] +; WAVE32-OPT-NEXT: buffer_store_dword v4, off, s[20:23], s32 offset:4 +; WAVE32-OPT-NEXT: s_swappc_b64 s[30:31], s[16:17] ; WAVE32-OPT-NEXT: ;;#ASMSTART -; 
WAVE32-OPT-NEXT: ; use s6 +; WAVE32-OPT-NEXT: ; use s15 ; WAVE32-OPT-NEXT: ;;#ASMEND ; WAVE32-OPT-NEXT: s_endpgm ; ; WAVE64-OPT-LABEL: kernel_stacksave_stackrestore_call_with_stack_objects: ; WAVE64-OPT: ; %bb.0: -; WAVE64-OPT-NEXT: s_getpc_b64 s[8:9] -; WAVE64-OPT-NEXT: s_mov_b32 s8, s0 +; WAVE64-OPT-NEXT: s_getpc_b64 s[20:21] +; WAVE64-OPT-NEXT: s_mov_b32 s20, s0 +; WAVE64-OPT-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; WAVE64-OPT-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0 +; WAVE64-OPT-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; WAVE64-OPT-NEXT: s_movk_i32 s32, 0x2400 -; WAVE64-OPT-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 -; WAVE64-OPT-NEXT: s_mov_b32 s0, s32 -; WAVE64-OPT-NEXT: v_mov_b32_e32 v0, 42 -; WAVE64-OPT-NEXT: v_mov_b32_e32 v1, 17 -; WAVE64-OPT-NEXT: s_mov_b32 s5, stack_passed_argument@abs32@hi -; WAVE64-OPT-NEXT: s_mov_b32 s4, stack_passed_argument@abs32@lo +; WAVE64-OPT-NEXT: s_mov_b64 s[10:11], s[4:5] +; WAVE64-OPT-NEXT: s_mov_b32 s4, s32 +; WAVE64-OPT-NEXT: v_mov_b32_e32 v3, 42 +; WAVE64-OPT-NEXT: v_mov_b32_e32 v4, 17 +; WAVE64-OPT-NEXT: v_or3_b32 v31, v0, v1, v2 +; WAVE64-OPT-NEXT: s_mov_b32 s14, s8 +; WAVE64-OPT-NEXT: s_mov_b32 s17, stack_passed_argument@abs32@hi +; WAVE64-OPT-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo +; WAVE64-OPT-NEXT: s_mov_b32 s12, s6 +; WAVE64-OPT-NEXT: s_mov_b32 s13, s7 ; WAVE64-OPT-NEXT: s_waitcnt lgkmcnt(0) -; WAVE64-OPT-NEXT: s_add_u32 s8, s8, s1 -; WAVE64-OPT-NEXT: s_addc_u32 s9, s9, 0 -; WAVE64-OPT-NEXT: s_lshr_b32 s6, s0, 6 -; WAVE64-OPT-NEXT: s_mov_b64 s[0:1], s[8:9] -; WAVE64-OPT-NEXT: s_mov_b64 s[2:3], s[10:11] -; WAVE64-OPT-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; WAVE64-OPT-NEXT: s_add_u32 s20, s20, s9 +; WAVE64-OPT-NEXT: s_addc_u32 s21, s21, 0 +; WAVE64-OPT-NEXT: s_lshr_b32 s15, s4, 6 +; WAVE64-OPT-NEXT: s_mov_b64 s[4:5], s[0:1] +; WAVE64-OPT-NEXT: s_mov_b64 s[8:9], s[2:3] +; WAVE64-OPT-NEXT: s_mov_b64 s[0:1], s[20:21] +; WAVE64-OPT-NEXT: s_mov_b64 s[2:3], s[22:23] +; WAVE64-OPT-NEXT: buffer_store_dword 
v3, off, s[20:23], 0 ; WAVE64-OPT-NEXT: s_waitcnt_vscnt null, 0x0 -; WAVE64-OPT-NEXT: buffer_store_dword v1, off, s[8:11], s32 offset:4 -; WAVE64-OPT-NEXT: s_swappc_b64 s[30:31], s[4:5] +; WAVE64-OPT-NEXT: buffer_store_dword v4, off, s[20:23], s32 offset:4 +; WAVE64-OPT-NEXT: s_swappc_b64 s[30:31], s[16:17] ; WAVE64-OPT-NEXT: ;;#ASMSTART -; WAVE64-OPT-NEXT: ; use s6 +; WAVE64-OPT-NEXT: ; use s15 ; WAVE64-OPT-NEXT: ;;#ASMEND ; WAVE64-OPT-NEXT: s_endpgm ; @@ -1274,70 +1292,70 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-OPT-LABEL: func_stacksave_stackrestore_call_with_stack_objects: ; WAVE32-OPT: ; %bb.0: ; WAVE32-OPT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; WAVE32-OPT-NEXT: s_mov_b32 s8, s33 +; WAVE32-OPT-NEXT: s_mov_b32 s20, s33 ; WAVE32-OPT-NEXT: s_mov_b32 s33, s32 -; WAVE32-OPT-NEXT: s_xor_saveexec_b32 s4, -1 -; WAVE32-OPT-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill -; WAVE32-OPT-NEXT: s_mov_b32 exec_lo, s4 -; WAVE32-OPT-NEXT: v_writelane_b32 v31, s30, 0 +; WAVE32-OPT-NEXT: s_xor_saveexec_b32 s16, -1 +; WAVE32-OPT-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill +; WAVE32-OPT-NEXT: s_mov_b32 exec_lo, s16 +; WAVE32-OPT-NEXT: v_writelane_b32 v32, s30, 0 ; WAVE32-OPT-NEXT: v_mov_b32_e32 v0, 42 ; WAVE32-OPT-NEXT: v_mov_b32_e32 v1, 17 ; WAVE32-OPT-NEXT: s_addk_i32 s32, 0x1200 -; WAVE32-OPT-NEXT: s_mov_b32 s5, stack_passed_argument@abs32@hi -; WAVE32-OPT-NEXT: s_mov_b32 s6, s32 -; WAVE32-OPT-NEXT: s_mov_b32 s4, stack_passed_argument@abs32@lo -; WAVE32-OPT-NEXT: v_writelane_b32 v31, s31, 1 -; WAVE32-OPT-NEXT: s_lshr_b32 s7, s6, 5 +; WAVE32-OPT-NEXT: s_mov_b32 s17, stack_passed_argument@abs32@hi +; WAVE32-OPT-NEXT: s_mov_b32 s18, s32 +; WAVE32-OPT-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo +; WAVE32-OPT-NEXT: v_writelane_b32 v32, s31, 1 +; WAVE32-OPT-NEXT: s_lshr_b32 s19, s18, 5 ; WAVE32-OPT-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 
WAVE32-OPT-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE32-OPT-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; WAVE32-OPT-NEXT: s_swappc_b64 s[30:31], s[4:5] +; WAVE32-OPT-NEXT: s_swappc_b64 s[30:31], s[16:17] ; WAVE32-OPT-NEXT: ;;#ASMSTART -; WAVE32-OPT-NEXT: ; use s7 +; WAVE32-OPT-NEXT: ; use s19 ; WAVE32-OPT-NEXT: ;;#ASMEND -; WAVE32-OPT-NEXT: s_mov_b32 s32, s6 -; WAVE32-OPT-NEXT: v_readlane_b32 s31, v31, 1 -; WAVE32-OPT-NEXT: v_readlane_b32 s30, v31, 0 +; WAVE32-OPT-NEXT: s_mov_b32 s32, s18 +; WAVE32-OPT-NEXT: v_readlane_b32 s31, v32, 1 +; WAVE32-OPT-NEXT: v_readlane_b32 s30, v32, 0 ; WAVE32-OPT-NEXT: s_xor_saveexec_b32 s4, -1 -; WAVE32-OPT-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload +; WAVE32-OPT-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; WAVE32-OPT-NEXT: s_mov_b32 exec_lo, s4 ; WAVE32-OPT-NEXT: s_addk_i32 s32, 0xee00 -; WAVE32-OPT-NEXT: s_mov_b32 s33, s8 +; WAVE32-OPT-NEXT: s_mov_b32 s33, s20 ; WAVE32-OPT-NEXT: s_waitcnt vmcnt(0) ; WAVE32-OPT-NEXT: s_setpc_b64 s[30:31] ; ; WAVE64-OPT-LABEL: func_stacksave_stackrestore_call_with_stack_objects: ; WAVE64-OPT: ; %bb.0: ; WAVE64-OPT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; WAVE64-OPT-NEXT: s_mov_b32 s8, s33 +; WAVE64-OPT-NEXT: s_mov_b32 s20, s33 ; WAVE64-OPT-NEXT: s_mov_b32 s33, s32 -; WAVE64-OPT-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; WAVE64-OPT-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill -; WAVE64-OPT-NEXT: s_mov_b64 exec, s[4:5] -; WAVE64-OPT-NEXT: v_writelane_b32 v31, s30, 0 +; WAVE64-OPT-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; WAVE64-OPT-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill +; WAVE64-OPT-NEXT: s_mov_b64 exec, s[16:17] +; WAVE64-OPT-NEXT: v_writelane_b32 v32, s30, 0 ; WAVE64-OPT-NEXT: v_mov_b32_e32 v0, 42 ; WAVE64-OPT-NEXT: v_mov_b32_e32 v1, 17 ; WAVE64-OPT-NEXT: s_addk_i32 s32, 0x2400 -; WAVE64-OPT-NEXT: s_mov_b32 s5, 
stack_passed_argument@abs32@hi -; WAVE64-OPT-NEXT: s_mov_b32 s6, s32 -; WAVE64-OPT-NEXT: s_mov_b32 s4, stack_passed_argument@abs32@lo -; WAVE64-OPT-NEXT: v_writelane_b32 v31, s31, 1 -; WAVE64-OPT-NEXT: s_lshr_b32 s7, s6, 6 +; WAVE64-OPT-NEXT: s_mov_b32 s17, stack_passed_argument@abs32@hi +; WAVE64-OPT-NEXT: s_mov_b32 s18, s32 +; WAVE64-OPT-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo +; WAVE64-OPT-NEXT: v_writelane_b32 v32, s31, 1 +; WAVE64-OPT-NEXT: s_lshr_b32 s19, s18, 6 ; WAVE64-OPT-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; WAVE64-OPT-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE64-OPT-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; WAVE64-OPT-NEXT: s_swappc_b64 s[30:31], s[4:5] +; WAVE64-OPT-NEXT: s_swappc_b64 s[30:31], s[16:17] ; WAVE64-OPT-NEXT: ;;#ASMSTART -; WAVE64-OPT-NEXT: ; use s7 +; WAVE64-OPT-NEXT: ; use s19 ; WAVE64-OPT-NEXT: ;;#ASMEND -; WAVE64-OPT-NEXT: s_mov_b32 s32, s6 -; WAVE64-OPT-NEXT: v_readlane_b32 s31, v31, 1 -; WAVE64-OPT-NEXT: v_readlane_b32 s30, v31, 0 +; WAVE64-OPT-NEXT: s_mov_b32 s32, s18 +; WAVE64-OPT-NEXT: v_readlane_b32 s31, v32, 1 +; WAVE64-OPT-NEXT: v_readlane_b32 s30, v32, 0 ; WAVE64-OPT-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; WAVE64-OPT-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload +; WAVE64-OPT-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; WAVE64-OPT-NEXT: s_mov_b64 exec, s[4:5] ; WAVE64-OPT-NEXT: s_addk_i32 s32, 0xdc00 -; WAVE64-OPT-NEXT: s_mov_b32 s33, s8 +; WAVE64-OPT-NEXT: s_mov_b32 s33, s20 ; WAVE64-OPT-NEXT: s_waitcnt vmcnt(0) ; WAVE64-OPT-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll index 01ad966597139..f7eb760fda084 100644 --- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll @@ -8,10 +8,10 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32: ; 
GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 @@ -21,8 +21,8 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) ; ; GFX7-LABEL: store_lds_v4i32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, s4 @@ -35,8 +35,8 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) ; ; GFX6-LABEL: store_lds_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 @@ -50,10 +50,10 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) ; GFX10-LABEL: store_lds_v4i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 @@ -64,8 +64,8 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) ; GFX11-LABEL: store_lds_v4i32: ; GFX11: ; %bb.0: ; 
GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 @@ -79,10 +79,10 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:12 @@ -123,8 +123,8 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; ; GFX7-LABEL: store_lds_v4i32_align1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -176,8 +176,8 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; ; GFX6-LABEL: store_lds_v4i32_align1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -230,10 +230,10 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: 
store_lds_v4i32_align1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-NEXT: s_lshr_b32 s3, s6, 24 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 @@ -275,8 +275,8 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: s_lshr_b32 s4, s3, 8 @@ -317,10 +317,10 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:12 @@ -337,8 +337,8 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; ; GFX7-LABEL: store_lds_v4i32_align2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; 
GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -366,8 +366,8 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; ; GFX6-LABEL: store_lds_v4i32_align2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -396,10 +396,10 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 @@ -417,8 +417,8 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -439,10 +439,10 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 @@ -453,8 +453,8 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 ; ; GFX7-LABEL: store_lds_v4i32_align4: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -468,8 +468,8 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 ; ; GFX6-LABEL: store_lds_v4i32_align4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -484,10 +484,10 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-NEXT: v_mov_b32_e32 v2, s7 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 @@ -499,8 +499,8 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 
+; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 @@ -515,10 +515,10 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 @@ -528,8 +528,8 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 ; ; GFX7-LABEL: store_lds_v4i32_align8: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, s4 @@ -542,8 +542,8 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 ; ; GFX6-LABEL: store_lds_v4i32_align8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -557,12 +557,12 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x10 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 @@ -571,8 +571,8 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: v_mov_b32_e32 v0, s0 @@ -587,10 +587,10 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 @@ -600,8 +600,8 @@ define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i ; ; GFX7-LABEL: store_lds_v4i32_align16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, s4 
@@ -614,8 +614,8 @@ define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i ; ; GFX6-LABEL: store_lds_v4i32_align16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 @@ -629,10 +629,10 @@ define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i ; GFX10-LABEL: store_lds_v4i32_align16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 @@ -643,8 +643,8 @@ define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i ; GFX11-LABEL: store_lds_v4i32_align16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/store-local.96.ll index 507b411996d97..64ce67a1a3dee 100644 --- a/llvm/test/CodeGen/AMDGPU/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.96.ll @@ -8,20 +8,20 @@ define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX9-NEXT: s_load_dword s2, 
s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -33,8 +33,8 @@ define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) ; ; GFX6-LABEL: store_lds_v3i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s4 @@ -48,21 +48,21 @@ define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) ; GFX10-LABEL: store_lds_v3i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s0 ; GFX10-NEXT: ds_write_b96 v3, v[0:2] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 +; 
GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0 @@ -75,10 +75,10 @@ define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:8 @@ -110,8 +110,8 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; ; GFX7-LABEL: store_lds_v3i32_align1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -152,8 +152,8 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; ; GFX6-LABEL: store_lds_v3i32_align1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -195,10 +195,10 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x10 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 @@ -231,8 +231,8 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s0 @@ -265,10 +265,10 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:8 @@ -282,8 +282,8 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 ; ; GFX7-LABEL: store_lds_v3i32_align2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -306,8 +306,8 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr 
addrspace(3) %out, <3 x i3 ; ; GFX6-LABEL: store_lds_v3i32_align2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -331,10 +331,10 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 @@ -349,8 +349,8 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -368,10 +368,10 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: 
v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 @@ -381,8 +381,8 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 ; ; GFX7-LABEL: store_lds_v3i32_align4: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -395,8 +395,8 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 ; ; GFX6-LABEL: store_lds_v3i32_align4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -410,10 +410,10 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 @@ -424,8 +424,8 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 @@ -439,10 +439,10 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 @@ -452,8 +452,8 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 ; ; GFX7-LABEL: store_lds_v3i32_align8: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 @@ -466,8 +466,8 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 ; ; GFX6-LABEL: store_lds_v3i32_align8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s4 @@ -481,10 +481,10 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 @@ -495,8 +495,8 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s2 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -510,20 +510,20 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align16(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -535,8 +535,8 @@ define amdgpu_kernel void @store_lds_v3i32_align16(ptr addrspace(3) %out, <3 x i ; ; GFX6-LABEL: store_lds_v3i32_align16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 -; 
GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s4 @@ -550,21 +550,21 @@ define amdgpu_kernel void @store_lds_v3i32_align16(ptr addrspace(3) %out, <3 x i ; GFX10-LABEL: store_lds_v3i32_align16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s0 ; GFX10-NEXT: ds_write_b96 v3, v[0:2] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32_align16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0 diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll index f88aaf389ca9a..3644bef9c20a1 100644 --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -50,12 +50,12 @@ define void @local_store_i56(ptr addrspace(3) %ptr, i56 %arg) #0 { define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; HAWAII-LABEL: local_store_i55: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_or_b32 s0, s4, 14 +; HAWAII-NEXT: s_or_b32 s0, s6, 14 ; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: v_mov_b32_e32 v1, s5 +; HAWAII-NEXT: v_mov_b32_e32 v1, s7 ; HAWAII-NEXT: flat_load_ubyte 
v0, v[0:1] -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[6:7], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: v_mov_b32_e32 v1, s2 @@ -70,12 +70,12 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; ; FIJI-LABEL: local_store_i55: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_or_b32 s0, s4, 14 +; FIJI-NEXT: s_or_b32 s0, s6, 14 ; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v1, s5 +; FIJI-NEXT: v_mov_b32_e32 v1, s7 ; FIJI-NEXT: flat_load_ubyte v0, v[0:1] -; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; FIJI-NEXT: s_load_dword s2, s[6:7], 0x0 ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: s_and_b32 s3, s1, 0xffff @@ -94,9 +94,9 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; GFX9-LABEL: local_store_i55: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte_d16_hi v0, v0, s[4:5] offset:14 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: global_load_ubyte_d16_hi v0, v0, s[6:7] offset:14 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s3, s1, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -114,9 +114,9 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-NEXT: global_load_ubyte_d16_hi v0, v0, s[4:5] offset:14 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x0 +; 
GFX10-NEXT: global_load_ubyte_d16_hi v0, v0, s[6:7] offset:14 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s3, s1, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v1, s2 @@ -133,16 +133,16 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; GFX11-LABEL: local_store_i55: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_load_d16_hi_u8 v0, v0, s[0:1] offset:14 +; GFX11-NEXT: global_load_d16_hi_u8 v0, v0, s[2:3] offset:14 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s1, s3, 0xffff -; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s3 -; GFX11-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-NEXT: s_and_b32 s3, s1, 0xffff +; GFX11-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s1 +; GFX11-NEXT: v_mov_b32_e32 v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v0, s1, v0 +; GFX11-NEXT: v_or_b32_e32 v0, s3, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 ; GFX11-NEXT: ds_store_b8_d16_hi v1, v0 offset:6 @@ -156,8 +156,8 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; HAWAII-LABEL: local_store_i48: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[6:7], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: v_mov_b32_e32 v0, s2 @@ -169,8 +169,8 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; ; FIJI-LABEL: local_store_i48: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 -; FIJI-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s2, s[6:7], 0x0 +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: v_mov_b32_e32 v0, s2 @@ -182,8 +182,8 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; ; GFX9-LABEL: local_store_i48: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -195,8 +195,8 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; GFX10-LABEL: local_store_i48: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 @@ -208,10 +208,10 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; GFX11-LABEL: local_store_i48: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: ds_store_b16 v0, v1 offset:4 ; GFX11-NEXT: ds_store_b32 v0, v2 @@ -223,9 +223,9 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; HAWAII-LABEL: local_store_i65: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: 
s_load_dword s2, s[4:5], 0x4 -; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[6:7], 0x4 +; HAWAII-NEXT: s_load_dword s3, s[6:7], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: s_and_b32 s2, s2, 1 @@ -239,9 +239,9 @@ define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; ; FIJI-LABEL: local_store_i65: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s2, s[4:5], 0x10 -; FIJI-NEXT: s_load_dword s3, s[4:5], 0x0 -; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s2, s[6:7], 0x10 +; FIJI-NEXT: s_load_dword s3, s[6:7], 0x0 +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: s_and_b32 s2, s2, 1 @@ -255,9 +255,9 @@ define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; ; GFX9-LABEL: local_store_i65: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x10 -; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 ; GFX9-NEXT: v_mov_b32_e32 v2, s3 @@ -271,9 +271,9 @@ define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; GFX10-LABEL: local_store_i65: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x10 -; GFX10-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-NEXT: s_load_dword s2, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s2, s2, 1 ; GFX10-NEXT: v_mov_b32_e32 v2, s3 @@ -287,13 +287,13 @@ 
define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; GFX11-LABEL: local_store_i65: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x10 -; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 1 +; GFX11-NEXT: s_and_b32 s2, s4, 1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s2 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: ds_store_b8 v2, v3 offset:8 ; GFX11-NEXT: ds_store_b64 v2, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index ded308ae4f230..6044873563254 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone speculatable define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX6-LABEL: s_sub_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -22,7 +22,7 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GFX8-LABEL: s_sub_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_i32 s2, s2, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -33,7 +33,7 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GFX9-LABEL: s_sub_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sub_i32 s2, s2, s3 @@ -43,7 +43,7 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GFX12-LABEL: s_sub_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_sub_co_i32 s2, s2, s3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -60,8 +60,8 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { ; GFX6-LABEL: s_sub_imm_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -72,10 +72,10 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { ; ; GFX8-LABEL: s_sub_imm_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s2, 0x4d2, s2 +; GFX8-NEXT: s_sub_i32 s2, 0x4d2, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 @@ -84,18 +84,18 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { ; ; GFX9-LABEL: s_sub_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_i32 s0, 0x4d2, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; 
GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_sub_i32 s2, 0x4d2, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: s_sub_imm_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_sub_co_i32 s2, 0x4d2, s2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -112,7 +112,7 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -130,7 +130,7 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: test_sub_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -144,7 +144,7 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: test_sub_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -155,7 +155,7 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX12-LABEL: test_sub_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -176,7 +176,7 @@ define amdgpu_kernel void 
@test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_imm_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -194,7 +194,7 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: test_sub_imm_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -208,7 +208,7 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: test_sub_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -219,7 +219,7 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX12-LABEL: test_sub_imm_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -238,7 +238,7 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -257,7 +257,7 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; 
; GFX8-LABEL: test_sub_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -272,7 +272,7 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -284,7 +284,7 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: test_sub_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -306,7 +306,7 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -328,7 +328,7 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX8-LABEL: test_sub_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -350,7 +350,7 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16 @@ -365,7 +365,7 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: test_sub_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 @@ -391,7 +391,7 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_mov_b32 s11, s7 @@ -412,7 +412,7 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: test_sub_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -432,7 +432,7 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: test_sub_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -446,9 +446,11 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX12-LABEL: test_sub_i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 
v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -472,7 +474,7 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_mov_b32 s11, s7 @@ -497,7 +499,7 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX8-LABEL: test_sub_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -515,7 +517,7 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -527,9 +529,11 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: test_sub_v2i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -551,7 +555,7 @@ define amdgpu_kernel void 
@test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_mov_b32 s11, s7 @@ -583,7 +587,7 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX8-LABEL: test_sub_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -604,7 +608,7 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -617,9 +621,11 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: test_sub_v4i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -642,8 +648,8 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 %b) nounwind { ; GFX6-LABEL: s_sub_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -656,8 +662,8 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 ; ; GFX8-LABEL: s_sub_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_u32 s2, s4, s6 ; GFX8-NEXT: s_subb_u32 s3, s5, s7 @@ -670,22 +676,22 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 ; ; GFX9-LABEL: s_sub_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s0, s4, s6 -; GFX9-NEXT: s_subb_u32 s1, s5, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: s_sub_u32 s2, s4, s6 +; GFX9-NEXT: s_subb_u32 s3, s5, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: s_sub_i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_sub_nc_u64 s[2:3], s[4:5], s[6:7] ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -702,8 +708,8 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 define amdgpu_kernel void @v_sub_i64(ptr 
addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) nounwind { ; GFX6-LABEL: v_sub_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s14, 0 ; GFX6-NEXT: s_mov_b32 s15, s11 @@ -725,8 +731,8 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX8-LABEL: v_sub_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -747,12 +753,12 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX9-LABEL: v_sub_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 @@ -763,10 +769,12 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac ; GFX12-LABEL: v_sub_i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; 
GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7] @@ -791,8 +799,8 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) { ; GFX6-LABEL: v_test_sub_v2i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s14, 0 ; GFX6-NEXT: s_mov_b32 s15, s11 @@ -816,8 +824,8 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: v_test_sub_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -840,12 +848,12 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: v_test_sub_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, 
v2, v6 @@ -858,10 +866,12 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace ; GFX12-LABEL: v_test_sub_v2i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_load_b128 v[0:3], v4, s[6:7] @@ -888,8 +898,8 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) { ; GFX6-LABEL: v_test_sub_v4i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s14, 0 ; GFX6-NEXT: s_mov_b32 s15, s11 @@ -921,8 +931,8 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: v_test_sub_v4i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -961,14 +971,14 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: v_test_sub_v4i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 5, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] ; GFX9-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:16 -; GFX9-NEXT: global_load_dwordx4 v[12:15], v16, s[2:3] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:16 ; GFX9-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v6 @@ -987,10 +997,12 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; GFX12-LABEL: v_test_sub_v4i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX12-NEXT: v_lshlrev_b32_e32 v12, 5, v0 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v16, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v12, 5, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_load_b128 v[0:3], v12, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index 6ec213a06999b..fe234a82ba6f7 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -8,13 +8,13 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -24,8 +24,8 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: v_test_sub_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -49,13 +49,13 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX10-LABEL: v_test_sub_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 @@ -67,8 +67,10 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX11-LABEL: v_test_sub_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, 
v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -96,25 +98,25 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0, ptr addrspace(4) %in1) #1 { ; GFX9-LABEL: s_test_sub_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 -; GFX9-NEXT: s_load_dword s11, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s9, s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_pk_sub_i16 v0, s11, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_pk_sub_i16 v0, s9, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_test_sub_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -137,23 +139,23 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX10-LABEL: s_test_sub_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX10-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-NEXT: 
s_load_dword s2, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 +; GFX10-NEXT: v_pk_sub_i16 v0, s2, s3 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_sub_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -175,7 +177,7 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 { ; GCN-LABEL: s_test_sub_self_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 @@ -185,7 +187,7 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX10-LABEL: s_test_sub_self_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 @@ -195,7 +197,7 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: s_test_sub_self_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 @@ -214,7 +216,7 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr define 
amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 { ; GFX9-LABEL: s_test_sub_v2i16_kernarg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -227,7 +229,7 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: s_test_sub_v2i16_kernarg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -246,7 +248,7 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX10-LABEL: s_test_sub_v2i16_kernarg: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -258,7 +260,7 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX11-LABEL: s_test_sub_v2i16_kernarg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -277,7 +279,7 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_mov_b32 s4, 0x1c8007b ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -291,7 +293,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, 
ptr ; ; VI-LABEL: v_test_sub_v2i16_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -310,7 +312,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: v_test_sub_v2i16_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc @@ -324,7 +326,9 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_test_sub_v2i16_constant: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc @@ -349,7 +353,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_neg_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_mov_b32 s4, 0xfc21fcb3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -363,7 +367,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ; ; VI-LABEL: v_test_sub_v2i16_neg_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -382,7 +386,7 @@ 
define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_test_sub_v2i16_neg_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc @@ -396,7 +400,9 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_sub_v2i16_neg_constant: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc @@ -420,7 +426,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_neg1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc @@ -433,7 +439,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; VI-LABEL: v_test_sub_v2i16_inline_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -452,7 +458,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX10-LABEL: v_test_sub_v2i16_inline_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc @@ -466,7 +472,9 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX11-LABEL: v_test_sub_v2i16_inline_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc @@ -490,7 +498,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc @@ -503,7 +511,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -521,7 +529,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX10-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc @@ -535,7 +543,9 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX11-LABEL: 
v_test_sub_v2i16_inline_lo_zero_hi: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc @@ -560,7 +570,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) % define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc @@ -573,7 +583,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; VI-LABEL: v_test_sub_v2i16_inline_fp_split: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -591,7 +601,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX10-LABEL: v_test_sub_v2i16_inline_fp_split: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc @@ -605,7 +615,9 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_sub_v2i16_inline_fp_split: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc @@ -630,13 +642,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -648,8 +660,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -672,13 +684,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt 
vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 @@ -692,8 +704,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -726,14 +740,14 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v3, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -746,8 +760,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -772,13 +786,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 @@ -794,8 +808,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -829,13 +845,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -847,8 +863,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -873,13 +889,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 @@ -893,8 +909,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 
s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -927,12 +945,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -947,8 +965,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -974,13 +992,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 @@ -997,8 +1015,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll index 873567c3ab6f4..d4329aec2021c 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll +++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll @@ -7,7 +7,7 @@ target triple="amdgcn--" ; NOTE: breaking large PHIs is disabled here else this example is completely optimized out ; before reaching codegen. 
-define amdgpu_kernel void @foobar(float %a0, float %a1, ptr addrspace(1) %out) nounwind { +define amdgpu_kernel void @foobar(float %a0, float %a1, ptr addrspace(1) %out) #1 { ; CHECK-LABEL: foobar: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 @@ -59,3 +59,4 @@ ife: declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0 attributes #0 = { nounwind readnone } +attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll index 1be420eccb353..19d633651fdd0 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll @@ -4,7 +4,7 @@ ; TODO: Update to check for granulated sgpr count directive once one is added. -define amdgpu_kernel void @kern() { +define amdgpu_kernel void @kern() #0 { ; ASM-LABEL: kern: ; ASM: .amdhsa_next_free_sgpr 5 ; ASM: .amdhsa_reserve_xnack_mask 1 @@ -23,5 +23,7 @@ entry: ret void } +attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } + !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll index acdcd16a1f9ef..2097579e0c995 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll @@ -4,7 +4,7 @@ ; TODO: Update to check for granulated sgpr count directive once one is added. 
-define amdgpu_kernel void @kern() { +define amdgpu_kernel void @kern() #0 { ; ASM-LABEL: kern: ; ASM: .amdhsa_next_free_sgpr 5 ; ASM: .amdhsa_reserve_xnack_mask 0 @@ -23,5 +23,7 @@ entry: ret void } +attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } + !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll index 0aac07342db84..775c62e73261a 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll @@ -4,7 +4,7 @@ ; TODO: Update to check for granulated sgpr count directive once one is added. -define amdgpu_kernel void @kern() { +define amdgpu_kernel void @kern() #0 { ; ASM-LABEL: kern: ; ASM: .amdhsa_next_free_sgpr 5 ; ASM: .amdhsa_reserve_xnack_mask 1 @@ -23,5 +23,7 @@ entry: ret void } +attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } + !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll index 7dce633e9186a..52370f6a2ef05 100644 --- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll +++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll @@ -12,7 +12,7 @@ declare void @llvm.debugtrap() #1 define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { ; NOHSA-TRAP-GFX900-LABEL: trap: ; NOHSA-TRAP-GFX900: ; %bb.0: -; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x24 ; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1 ; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -22,9 +22,9 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { ; ; HSA-TRAP-GFX803-LABEL: trap: ; HSA-TRAP-GFX803: ; %bb.0: -; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1 -; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[4:5] +; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7] ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s2 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s3 @@ -34,7 +34,7 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { ; ; HSA-TRAP-GFX900-LABEL: trap: ; HSA-TRAP-GFX900: ; %bb.0: -; HSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; HSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1 ; HSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -44,7 +44,7 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { ; ; HSA-NOTRAP-GFX900-LABEL: trap: ; HSA-NOTRAP-GFX900: ; %bb.0: -; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1 ; HSA-NOTRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -54,7 +54,7 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { ; ; HSA-TRAP-GFX1100-LABEL: trap: ; HSA-TRAP-GFX1100: ; %bb.0: -; HSA-TRAP-GFX1100-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; HSA-TRAP-GFX1100-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX1100-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1 ; HSA-TRAP-GFX1100-NEXT: s_mov_b32 ttmp2, m0 ; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0) @@ -103,7 +103,7 @@ define 
amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { ; NOHSA-TRAP-GFX900-LABEL: non_entry_trap: ; NOHSA-TRAP-GFX900: ; %bb.0: ; %entry -; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) ; NOHSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -120,7 +120,7 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a ; ; HSA-TRAP-GFX803-LABEL: non_entry_trap: ; HSA-TRAP-GFX803: ; %bb.0: ; %entry -; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -136,12 +136,12 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a ; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0) ; HSA-TRAP-GFX803-NEXT: s_endpgm ; HSA-TRAP-GFX803-NEXT: .LBB1_2: ; %trap -; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[4:5] +; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7] ; HSA-TRAP-GFX803-NEXT: s_trap 2 ; ; HSA-TRAP-GFX900-LABEL: non_entry_trap: ; HSA-TRAP-GFX900: ; %bb.0: ; %entry -; HSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; HSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -158,7 +158,7 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a ; ; HSA-NOTRAP-GFX900-LABEL: non_entry_trap: ; HSA-NOTRAP-GFX900: ; %bb.0: ; %entry -; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; 
HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NOTRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NOTRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -175,7 +175,7 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a ; ; HSA-TRAP-GFX1100-LABEL: non_entry_trap: ; HSA-TRAP-GFX1100: ; %bb.0: ; %entry -; HSA-TRAP-GFX1100-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; HSA-TRAP-GFX1100-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX1100-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX1100-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc @@ -267,7 +267,7 @@ ret: define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { ; NOHSA-TRAP-GFX900-LABEL: trap_with_use_after: ; NOHSA-TRAP-GFX900: ; %bb.0: -; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) ; NOHSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -281,8 +281,8 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs ; ; HSA-TRAP-GFX803-LABEL: trap_with_use_after: ; HSA-TRAP-GFX803: ; %bb.0: -; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[4:5] -; HSA-TRAP-GFX803-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7] +; HSA-TRAP-GFX803-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s4 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s5 @@ -297,7 +297,7 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs ; ; HSA-TRAP-GFX900-LABEL: trap_with_use_after: ; HSA-TRAP-GFX900: ; %bb.0: -; HSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; HSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX900-NEXT: s_waitcnt 
lgkmcnt(0) ; HSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -309,7 +309,7 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs ; ; HSA-NOTRAP-GFX900-LABEL: trap_with_use_after: ; HSA-NOTRAP-GFX900: ; %bb.0: -; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NOTRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NOTRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -323,7 +323,7 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs ; ; HSA-TRAP-GFX1100-LABEL: trap_with_use_after: ; HSA-TRAP-GFX1100: ; %bb.0: -; HSA-TRAP-GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; HSA-TRAP-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; HSA-TRAP-GFX1100-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX1100-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc @@ -403,7 +403,7 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) { ; NOHSA-TRAP-GFX900-LABEL: debugtrap: ; NOHSA-TRAP-GFX900: ; %bb.0: -; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1 ; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v2, 2 @@ -416,7 +416,7 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) ; ; HSA-TRAP-GFX803-LABEL: debugtrap: ; HSA-TRAP-GFX803: ; %bb.0: -; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v3, 2 ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) @@ -431,7 +431,7 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) 
nocapture readonly %arg0) ; ; HSA-TRAP-GFX900-LABEL: debugtrap: ; HSA-TRAP-GFX900: ; %bb.0: -; HSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; HSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1 ; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v2, 2 @@ -445,7 +445,7 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) ; ; HSA-NOTRAP-GFX900-LABEL: debugtrap: ; HSA-NOTRAP-GFX900: ; %bb.0: -; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1 ; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v2, 2 @@ -458,7 +458,7 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) ; ; HSA-TRAP-GFX1100-LABEL: debugtrap: ; HSA-TRAP-GFX1100: ; %bb.0: -; HSA-TRAP-GFX1100-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; HSA-TRAP-GFX1100-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX1100-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1 ; HSA-TRAP-GFX1100-NEXT: v_mov_b32_e32 v2, 2 ; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/trap.ll b/llvm/test/CodeGen/AMDGPU/trap.ll index 2f687295af73e..9bab3e6fcf8c4 100644 --- a/llvm/test/CodeGen/AMDGPU/trap.ll +++ b/llvm/test/CodeGen/AMDGPU/trap.ll @@ -31,14 +31,14 @@ declare void @llvm.debugtrap() #1 ; MESA-TRAP: .section .AMDGPU.config ; MESA-TRAP: .long 47180 -; MESA-TRAP-NEXT: .long 208 +; MESA-TRAP-NEXT: .long 5080 ; NOMESA-TRAP: .section .AMDGPU.config ; NOMESA-TRAP: .long 47180 -; NOMESA-TRAP-NEXT: .long 144 +; NOMESA-TRAP-NEXT: .long 5016 ; GCN-LABEL: {{^}}hsa_trap: -; HSA-TRAP: s_mov_b64 s[0:1], s[4:5] +; HSA-TRAP: s_mov_b64 s[0:1], s[6:7] ; HSA-TRAP: s_trap 2 ; HSA-TRAP: COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 @@ -59,11 +59,11 @@ define amdgpu_kernel void @hsa_trap(ptr addrspace(1) nocapture readonly %arg0) { ; 
MESA-TRAP: .section .AMDGPU.config ; MESA-TRAP: .long 47180 -; MESA-TRAP-NEXT: .long 204 +; MESA-TRAP-NEXT: .long 5080 ; NOMESA-TRAP: .section .AMDGPU.config ; NOMESA-TRAP: .long 47180 -; NOMESA-TRAP-NEXT: .long 140 +; NOMESA-TRAP-NEXT: .long 5016 ; GCN-LABEL: {{^}}hsa_debugtrap: ; HSA-TRAP: s_trap 3 @@ -102,7 +102,7 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { ; NO-TRAP-BIT: enable_trap_handler = 0 ; HSA-TRAP: BB{{[0-9]_[0-9]+}}: ; %trap -; HSA-TRAP: s_mov_b64 s[0:1], s[4:5] +; HSA-TRAP: s_mov_b64 s[0:1], s[6:7] ; HSA-TRAP-NEXT: s_trap 2 define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { entry: @@ -124,7 +124,7 @@ ret: ; NO-TRAP-BIT: enable_trap_handler = 0 ; HSA-TRAP: BB{{[0-9]_[0-9]+}}: ; %trap -; HSA-TRAP: s_mov_b64 s[0:1], s[4:5] +; HSA-TRAP: s_mov_b64 s[0:1], s[6:7] ; HSA-TRAP-NEXT: s_trap 2 define amdgpu_kernel void @non_entry_trap_no_unreachable(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { entry: diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll index c0c56ebb16610..22eb7dddb84f4 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll @@ -85,8 +85,8 @@ define i16 @trunc_bitcast_v2f32_to_i16(<2 x float> %bar) { define amdgpu_kernel void @truncate_high_elt_extract_vector(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture readonly %arg1, ptr addrspace(1) nocapture %arg2) local_unnamed_addr { ; SI-LABEL: truncate_high_elt_extract_vector: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -103,8 +103,8 @@ define amdgpu_kernel void @truncate_high_elt_extract_vector(ptr addrspace(1) noc ; ; 
VI-LABEL: truncate_high_elt_extract_vector: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[4:5], 0x0 ; VI-NEXT: s_load_dword s3, s[6:7], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store.ll b/llvm/test/CodeGen/AMDGPU/trunc-store.ll index 931953e230bb2..efb1a630f927c 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-store.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-store.ll @@ -5,58 +5,58 @@ define amdgpu_kernel void @truncstore_arg_v16i32_to_v16i8(ptr addrspace(1) %out, <16 x i32> %in) { ; SI-LABEL: truncstore_arg_v16i32_to_v16i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 +; SI-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s23, 0xf000 +; SI-NEXT: s_mov_b32 s22, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: s_lshl_b32 s13, s13, 8 -; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 8 +; SI-NEXT: s_and_b32 s1, s18, 0xff +; SI-NEXT: s_lshl_b32 s0, s19, 24 +; SI-NEXT: s_lshl_b32 s1, s1, 16 +; SI-NEXT: s_or_b32 s0, s0, s1 +; SI-NEXT: s_lshl_b32 s1, s17, 8 +; SI-NEXT: s_and_b32 s2, s16, 0xff +; SI-NEXT: s_or_b32 s1, s2, s1 +; SI-NEXT: s_and_b32 s1, s1, 0xffff +; SI-NEXT: s_and_b32 s2, s14, 0xff +; SI-NEXT: s_or_b32 s0, s1, s0 +; SI-NEXT: s_lshl_b32 s1, s15, 24 +; SI-NEXT: s_lshl_b32 s2, s2, 16 +; SI-NEXT: s_or_b32 s1, s1, s2 +; SI-NEXT: s_lshl_b32 s2, s13, 8 +; SI-NEXT: s_and_b32 s3, s12, 0xff +; SI-NEXT: s_or_b32 s2, s3, s2 +; SI-NEXT: s_and_b32 s2, s2, 0xffff 
+; SI-NEXT: s_and_b32 s3, s10, 0xff +; SI-NEXT: s_or_b32 s1, s2, s1 +; SI-NEXT: s_lshl_b32 s2, s11, 24 +; SI-NEXT: s_lshl_b32 s3, s3, 16 +; SI-NEXT: s_or_b32 s2, s2, s3 +; SI-NEXT: s_lshl_b32 s3, s9, 8 ; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_or_b32 s3, s8, s3 +; SI-NEXT: s_and_b32 s3, s3, 0xffff ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: s_lshl_b32 s15, s15, 24 -; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: s_lshl_b32 s11, s11, 24 -; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_lshl_b32 s7, s7, 24 +; SI-NEXT: s_or_b32 s2, s3, s2 +; SI-NEXT: s_lshl_b32 s3, s7, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s3, s3, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s8 -; SI-NEXT: v_mov_b32_e32 v2, s12 -; SI-NEXT: v_mov_b32_e32 v3, s16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_or_b32 s3, s4, s3 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_mov_b32_e32 v2, s1 +; SI-NEXT: v_mov_b32_e32 v3, s0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: truncstore_arg_v16i32_to_v16i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; VI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshlrev_b16_e64 v0, 8, s17 ; VI-NEXT: v_mov_b32_e32 v1, s16 @@ -98,9 +98,9 @@ define amdgpu_kernel void @truncstore_arg_v16i32_to_v16i8(ptr addrspace(1) %out, define amdgpu_kernel void @truncstore_arg_v16i64_to_v16i8(ptr addrspace(1) %out, <16 x i64> %in) { ; SI-LABEL: truncstore_arg_v16i64_to_v16i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[16:31], s[0:1], 0x39 -; SI-NEXT: s_load_dwordx2 s[36:37], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx16 s[0:15], s[0:1], 0x29 +; SI-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x39 +; SI-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x29 ; SI-NEXT: s_mov_b32 s39, 0xf000 ; SI-NEXT: s_mov_b32 s38, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -149,9 +149,9 @@ define amdgpu_kernel void @truncstore_arg_v16i64_to_v16i8(ptr addrspace(1) %out, ; ; VI-LABEL: truncstore_arg_v16i64_to_v16i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[16:31], s[0:1], 0xe4 -; VI-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx16 s[0:15], s[0:1], 0xa4 +; VI-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0xe4 +; VI-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0xa4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshlrev_b16_e64 v0, 8, s26 ; VI-NEXT: v_mov_b32_e32 v1, s24 diff --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll index a9cd0e997e0e5..88bdf6454fe52 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc.ll @@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @trunc_i64_to_i32_store(ptr addrspace(1) %out, [8 x i32], i64 %in) { ; GCN-LABEL: {{^}}trunc_i64_to_i32_store: -; GCN: s_load_dword [[SLOAD:s[0-9]+]], s[0:1], +; GCN: s_load_dword [[SLOAD:s[0-9]+]], s[2:3], ; GCN: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]] ; SI: buffer_store_dword [[VLOAD]] ; VI: flat_store_dword v[{{[0-9:]+}}], 
[[VLOAD]] diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll index 416dbb226422c..03a1b3598024b 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddo.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 { ; SI-LABEL: s_uaddo_i64_zext: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -27,8 +27,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; ; VI-LABEL: s_uaddo_i64_zext: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s6, s0 @@ -46,14 +46,14 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; ; GFX9-LABEL: s_uaddo_i64_zext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_add_u32 s0, s6, s2 +; GFX9-NEXT: s_add_u32 s0, s6, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s1, s7, s3 +; GFX9-NEXT: s_addc_u32 s1, s7, s1 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -75,8 +75,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, 
ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; SI-LABEL: s_uaddo_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -95,8 +95,8 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: s_uaddo_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s1 @@ -111,12 +111,12 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_uaddo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: global_store_byte v0, v2, s[6:7] @@ -132,7 +132,7 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_uaddo_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -161,7 
+161,7 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_uaddo_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -182,7 +182,7 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_uaddo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] @@ -210,7 +210,7 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_uaddo_i32_novcc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -243,7 +243,7 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_uaddo_i32_novcc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -268,7 +268,7 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_uaddo_i32_novcc: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] @@ -301,7 +301,7 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel 
void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 { ; SI-LABEL: s_uaddo_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -325,7 +325,7 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: s_uaddo_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s4, s6 @@ -345,7 +345,7 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_uaddo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s6, s4, s6 @@ -370,7 +370,7 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_uaddo_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -401,7 +401,7 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_uaddo_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -424,7 +424,7 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_uaddo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 
0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] @@ -454,7 +454,7 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_uaddo_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -486,7 +486,7 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_uaddo_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -508,7 +508,7 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_uaddo_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] @@ -537,7 +537,7 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_uaddo_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -568,7 +568,7 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_uaddo_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 
s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -591,7 +591,7 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_uaddo_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] @@ -618,45 +618,45 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; SI-LABEL: s_uaddo_clamp_bit: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 -; SI-NEXT: s_cmp_eq_u32 s2, s3 -; SI-NEXT: s_mov_b64 s[2:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: s_cmp_eq_u32 s0, s1 +; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_cbranch_scc1 .LBB8_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: s_xor_b64 s[2:3], vcc, -1 +; SI-NEXT: s_xor_b64 s[0:1], vcc, -1 ; SI-NEXT: .LBB8_2: ; %exit -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: buffer_store_dword v0, off, 
s[8:11], 0 -; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_uaddo_clamp_bit: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: s_cmp_eq_u32 s2, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 -; VI-NEXT: s_mov_b64 s[2:3], 0 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: s_cmp_eq_u32 s0, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_cbranch_scc1 .LBB8_2 ; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: s_xor_b64 s[2:3], vcc, -1 +; VI-NEXT: s_xor_b64 s[0:1], vcc, -1 ; VI-NEXT: .LBB8_2: ; %exit -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3] +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 @@ -668,19 +668,19 @@ define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: s_uaddo_clamp_bit: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: s_cmp_eq_u32 s2, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_cmp_eq_u32 s0, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: s_xor_b64 s[2:3], vcc, -1 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, -1 ; GFX9-NEXT: .LBB8_2: ; %exit -; GFX9-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: global_store_byte v1, v2, s[6:7] @@ -706,7 +706,7 @@ exit: define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_uaddo_clamp_bit: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s14, s2 @@ -740,7 +740,7 @@ define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_uaddo_clamp_bit: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; VI-NEXT: s_mov_b64 s[2:3], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 @@ -767,7 +767,7 @@ define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_uaddo_clamp_bit: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index f686aad0cefc2..dfd9a650ff0e9 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ 
-44,7 +44,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: udiv_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -80,7 +80,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GCN-LABEL: udiv_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -112,7 +112,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX1030-LABEL: udiv_i32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -185,7 +185,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: s_udiv_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -218,7 +218,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; VI-LABEL: s_udiv_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -251,7 +251,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GCN-LABEL: s_udiv_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], 
s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GCN-NEXT: s_sub_i32 s4, 0, s3 @@ -282,7 +282,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GFX1030-LABEL: s_udiv_i32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX1030-NEXT: s_sub_i32 s5, 0, s3 @@ -346,7 +346,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -401,7 +401,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: udiv_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -456,7 +456,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: udiv_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -507,7 +507,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX1030-LABEL: udiv_v2i32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -619,7 +619,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr 
addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s6, s10 @@ -714,7 +714,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: udiv_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s6, s10 @@ -809,7 +809,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: udiv_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 16 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -904,7 +904,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX1030-LABEL: udiv_v4i32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v8, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_clause 0x1 @@ -1098,7 +1098,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_i32_div_pow2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1116,7 +1116,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: udiv_i32_div_pow2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; 
VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1134,7 +1134,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac ; ; GCN-LABEL: udiv_i32_div_pow2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1148,7 +1148,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac ; ; GFX1030-LABEL: udiv_i32_div_pow2: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3] @@ -1183,7 +1183,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_i32_div_k_even: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1203,7 +1203,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: udiv_i32_div_k_even: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1223,7 +1223,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp ; ; GCN-LABEL: udiv_i32_div_k_even: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1239,7 
+1239,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp ; ; GFX1030-LABEL: udiv_i32_div_k_even: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3] @@ -1277,7 +1277,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_i32_div_k_odd: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1297,7 +1297,7 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: udiv_i32_div_k_odd: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1317,7 +1317,7 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa ; ; GCN-LABEL: udiv_i32_div_k_odd: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1333,7 +1333,7 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa ; ; GFX1030-LABEL: udiv_i32_div_k_odd: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3] @@ -1371,7 +1371,7 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa define 
amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_udiv_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1400,7 +1400,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: v_udiv_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1429,7 +1429,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GCN-LABEL: v_udiv_i8: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1452,7 +1452,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX1030-LABEL: v_udiv_i8: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_ushort v1, v0, s[2:3] @@ -1511,7 +1511,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_udiv_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1540,7 +1540,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: v_udiv_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; 
VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1569,7 +1569,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: v_udiv_i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1592,7 +1592,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX1030-LABEL: v_udiv_i16: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3] @@ -1651,7 +1651,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_udiv_i23: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1688,7 +1688,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: v_udiv_i23: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1725,7 +1725,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: v_udiv_i23: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 4 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -1770,7 +1770,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; 
GFX1030-LABEL: v_udiv_i23: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_clause 0x3 @@ -1848,7 +1848,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_udiv_i24: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1885,7 +1885,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: v_udiv_i24: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1922,7 +1922,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: v_udiv_i24: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 4 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -1967,7 +1967,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX1030-LABEL: v_udiv_i24: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_clause 0x3 @@ -2048,7 +2048,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) { ; SI-LABEL: scalarize_mulhu_4xi32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2076,7 +2076,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read ; ; VI-LABEL: scalarize_mulhu_4xi32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2104,7 +2104,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read ; ; GCN-LABEL: scalarize_mulhu_4xi32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -2130,7 +2130,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read ; ; GFX1030-LABEL: scalarize_mulhu_4xi32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] @@ -2193,7 +2193,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read define amdgpu_kernel void @test_udiv2(i32 %p) { ; SI-LABEL: test_udiv2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2205,7 +2205,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { ; ; VI-LABEL: test_udiv2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2217,7 +2217,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { ; ; GCN-LABEL: test_udiv2: 
; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 +; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s0, s0, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -2227,7 +2227,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { ; ; GFX1030-LABEL: test_udiv2: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX1030-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_lshr_b32 s0, s0, 1 ; GFX1030-NEXT: v_mov_b32_e32 v0, s0 @@ -2253,7 +2253,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; SI-LABEL: test_udiv_3_mulhu: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[2:3], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -2266,7 +2266,7 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; ; VI-LABEL: test_udiv_3_mulhu: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -2279,7 +2279,7 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; ; GCN-LABEL: test_udiv_3_mulhu: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 +; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -2290,7 +2290,7 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; ; GFX1030-LABEL: test_udiv_3_mulhu: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX1030-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_mul_hi_u32 s0, s0, 0xaaaaaaab ; GFX1030-NEXT: s_lshr_b32 s0, s0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index 84906ac1f27ba..78f85569f849d 100644 --- 
a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -5,8 +5,8 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -123,8 +123,8 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; ; GCN-IR-LABEL: s_test_udiv_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -398,8 +398,8 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_udiv24_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv24_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0xe -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xe +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -423,8 +423,8 @@ define amdgpu_kernel void @s_test_udiv24_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_udiv24_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xe -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s4, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -497,17 +497,17 @@ define i64 @v_test_udiv24_i64(i64 %x, i64 %y) { define 
amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv32_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-NEXT: s_load_dword s8, s[2:3], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-NEXT: s_sub_i32 s2, 0, s8 +; GCN-NEXT: s_sub_i32 s0, 0, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 @@ -533,17 +533,17 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64 ; ; GCN-IR-LABEL: s_test_udiv32_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dword s8, s[2:3], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 +; GCN-IR-NEXT: s_sub_i32 s0, 0, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v0 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s4, s0 @@ -576,18 +576,18 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64 define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv31_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[0:1], 0xe +; GCN-NEXT: s_load_dword s0, s[2:3], 
0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s8, s2, 1 +; GCN-NEXT: s_lshr_b32 s8, s0, 1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-NEXT: s_sub_i32 s2, 0, s8 +; GCN-NEXT: s_sub_i32 s0, 0, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s2, s3, 1 @@ -614,18 +614,18 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64 ; ; GCN-IR-LABEL: s_test_udiv31_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dword s0, s[2:3], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_lshr_b32 s8, s2, 1 +; GCN-IR-NEXT: s_lshr_b32 s8, s0, 1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 +; GCN-IR-NEXT: s_sub_i32 s0, 0, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v0 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_lshr_b32 s2, s3, 1 @@ -659,8 +659,8 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64 define amdgpu_kernel void @s_test_udiv23_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv23_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0xe -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xe +; 
GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -684,8 +684,8 @@ define amdgpu_kernel void @s_test_udiv23_i64(ptr addrspace(1) %out, i64 %x, i64 ; ; GCN-IR-LABEL: s_test_udiv23_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xe -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s4, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -716,12 +716,14 @@ define amdgpu_kernel void @s_test_udiv23_i64(ptr addrspace(1) %out, i64 %x, i64 define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 %y) { ; GCN-LABEL: s_test_udiv24_i48: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s0, s2, 0xff000000 -; GCN-NEXT: s_and_b32 s1, s3, 0xffff +; GCN-NEXT: s_and_b32 s0, s0, 0xff000000 +; GCN-NEXT: s_and_b32 s1, s1, 0xffff ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_alignbit_b32 v0, s1, v0, 24 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 @@ -732,20 +734,18 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-NEXT: s_sub_u32 s8, 0, s0 ; GCN-NEXT: s_subb_u32 s9, 0, s1 -; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s0, s4 
+; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_mul_lo_u32 v3, s8, v2 ; GCN-NEXT: v_mul_hi_u32 v4, s8, v1 ; GCN-NEXT: v_mul_lo_u32 v5, s9, v1 ; GCN-NEXT: v_mul_lo_u32 v6, s8, v1 -; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GCN-NEXT: v_mul_lo_u32 v4, v1, v3 @@ -831,20 +831,20 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; ; GCN-IR-LABEL: s_test_udiv24_i48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb ; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_and_b32 s3, s5, 0xffff -; GCN-IR-NEXT: s_and_b32 s2, s4, 0xff000000 +; GCN-IR-NEXT: s_and_b32 s1, s5, 0xffff +; GCN-IR-NEXT: s_and_b32 s0, s4, 0xff000000 ; GCN-IR-NEXT: s_and_b32 s5, s7, 0xffff ; GCN-IR-NEXT: s_and_b32 s4, s6, 0xff000000 -; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[2:3], 24 -; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[4:5], 24 +; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[0:1], 24 +; GCN-IR-NEXT: s_lshr_b64 s[0:1], s[4:5], 24 ; GCN-IR-NEXT: s_and_b32 s9, s9, 0xffff -; GCN-IR-NEXT: s_and_b32 s3, s3, 0xffff -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0 +; GCN-IR-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[0:1], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[8:9], 0 -; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[2:3] +; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[0:1] ; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] ; GCN-IR-NEXT: s_flbit_i32_b64 s16, s[8:9] ; GCN-IR-NEXT: s_sub_u32 s12, s10, s16 @@ -869,8 +869,8 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[8:9], s14 -; GCN-IR-NEXT: s_add_u32 s14, s2, -1 -; GCN-IR-NEXT: s_addc_u32 s15, s3, -1 +; GCN-IR-NEXT: s_add_u32 s14, s0, -1 +; GCN-IR-NEXT: s_addc_u32 s15, s1, -1 ; 
GCN-IR-NEXT: s_not_b64 s[4:5], s[10:11] ; GCN-IR-NEXT: s_add_u32 s8, s4, s16 ; GCN-IR-NEXT: s_addc_u32 s9, s5, 0 @@ -888,7 +888,7 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-IR-NEXT: s_ashr_i32 s10, s4, 31 ; GCN-IR-NEXT: s_mov_b32 s11, s10 ; GCN-IR-NEXT: s_and_b32 s4, s10, 1 -; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], s[2:3] +; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], s[0:1] ; GCN-IR-NEXT: s_sub_u32 s12, s12, s10 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s11 ; GCN-IR-NEXT: s_add_u32 s8, s8, 1 @@ -898,10 +898,10 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17] ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_3 ; GCN-IR-NEXT: .LBB7_4: ; %Flow4 -; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3] +; GCN-IR-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 +; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[0:1] ; GCN-IR-NEXT: .LBB7_5: ; %udiv-end -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s7 @@ -920,7 +920,7 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_udiv_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1025,7 +1025,7 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; ; GCN-IR-LABEL: s_test_udiv_k_num_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: 
s_flbit_i32_b64 s12, s[2:3] @@ -1364,7 +1364,7 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) { define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_udiv_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN-NEXT: s_add_u32 s1, 0, 0xaaaa0000 ; GCN-NEXT: v_not_b32_e32 v0, 23 ; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 @@ -1443,7 +1443,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; ; GCN-IR-LABEL: s_test_udiv_k_den_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3] ; GCN-IR-NEXT: s_sub_u32 s8, 59, s12 @@ -1661,7 +1661,7 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) { define amdgpu_kernel void @s_test_udiv24_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_udiv24_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s4, 0x41c00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s2, s3, 8 @@ -1682,7 +1682,7 @@ define amdgpu_kernel void @s_test_udiv24_k_num_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_udiv24_k_num_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_lshr_b32 s2, s3, 8 @@ -1709,7 +1709,7 @@ define amdgpu_kernel void @s_test_udiv24_k_num_i64(ptr addrspace(1) %out, i64 %x define amdgpu_kernel void @s_test_udiv24_k_den_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_udiv24_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; 
GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1731,7 +1731,7 @@ define amdgpu_kernel void @s_test_udiv24_k_den_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_udiv24_k_den_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll index f0f0b6680e0e6..1468c7b99b5c2 100644 --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -36,22 +36,22 @@ define amdgpu_kernel void @test_udivrem(ptr addrspace(1) %out0, [8 x i32], ptr a ; ; GFX6-LABEL: test_udivrem: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s8, s[0:1], 0x26 -; GFX6-NEXT: s_load_dword s9, s[0:1], 0x1d -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x13 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dword s8, s[2:3], 0x26 +; GFX6-NEXT: s_load_dword s9, s[2:3], 0x1d ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: s_sub_i32 s2, 0, s8 -; GFX6-NEXT: s_mov_b32 s3, s7 +; GFX6-NEXT: s_sub_i32 s0, 0, s8 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 -; GFX6-NEXT: s_mov_b32 s2, s6 +; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 ; GFX6-NEXT: v_readfirstlane_b32 s10, v0 @@ -69,33 +69,34 @@ define amdgpu_kernel void @test_udivrem(ptr addrspace(1) 
%out0, [8 x i32], ptr a ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: s_cselect_b32 s8, s10, s9 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: test_udivrem: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x98 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x74 +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x98 +; GFX8-NEXT: s_load_dword s5, s[2:3], 0x74 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX8-NEXT: s_sub_i32 s2, 0, s4 +; GFX8-NEXT: s_sub_i32 s0, 0, s4 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, s2, v0 -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c +; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x4c ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s5, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v4 ; GFX8-NEXT: s_mul_i32 s0, s0, s4 ; GFX8-NEXT: s_sub_i32 s0, s5, s0 @@ -163,33 +164,33 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x, ; ; GFX6-LABEL: test_udivrem_v2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX6-NEXT: s_sub_i32 s2, 0, s6 +; GFX6-NEXT: s_sub_i32 s0, 0, s6 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s6 -; GFX6-NEXT: s_sub_i32 s2, s4, s2 -; GFX6-NEXT: s_sub_i32 s3, s2, s6 -; GFX6-NEXT: s_cmp_ge_u32 s2, s6 -; GFX6-NEXT: s_cselect_b32 s2, s3, s2 -; GFX6-NEXT: s_sub_i32 s3, s2, s6 -; GFX6-NEXT: s_cmp_ge_u32 s2, s6 -; GFX6-NEXT: s_cselect_b32 s4, s3, s2 -; GFX6-NEXT: s_sub_i32 s2, 0, s7 -; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s0, s0, s6 +; GFX6-NEXT: s_sub_i32 s0, s4, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s6 +; GFX6-NEXT: s_cmp_ge_u32 s0, s6 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s6 +; GFX6-NEXT: s_cmp_ge_u32 s0, s6 +; GFX6-NEXT: s_cselect_b32 s4, s1, s0 +; GFX6-NEXT: s_sub_i32 s0, 0, s7 +; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 @@ -206,44 +207,46 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x, ; GFX6-NEXT: s_cselect_b32 s5, s6, s5 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: 
test_udivrem_v2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX8-NEXT: s_sub_i32 s2, 0, s6 +; GFX8-NEXT: s_sub_i32 s0, 0, s6 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: s_mul_i32 s0, s0, s6 +; GFX8-NEXT: s_sub_i32 s0, s4, s0 +; GFX8-NEXT: s_sub_i32 s1, s0, s6 +; GFX8-NEXT: s_cmp_ge_u32 s0, s6 +; GFX8-NEXT: s_cselect_b32 s0, s1, s0 +; GFX8-NEXT: s_sub_i32 s1, s0, s6 +; GFX8-NEXT: s_cmp_ge_u32 s0, s6 +; GFX8-NEXT: s_cselect_b32 s4, s1, s0 +; GFX8-NEXT: s_sub_i32 s0, 0, s7 +; GFX8-NEXT: v_mul_lo_u32 v0, s0, v1 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: s_mul_i32 s2, s2, s6 -; GFX8-NEXT: s_sub_i32 s2, s4, s2 -; GFX8-NEXT: s_sub_i32 s3, s2, s6 -; GFX8-NEXT: s_cmp_ge_u32 s2, s6 -; GFX8-NEXT: s_cselect_b32 s2, s3, s2 -; GFX8-NEXT: s_sub_i32 s3, s2, s6 -; GFX8-NEXT: s_cmp_ge_u32 s2, s6 -; GFX8-NEXT: s_cselect_b32 s2, s3, s2 -; GFX8-NEXT: s_sub_i32 s3, 0, s7 -; GFX8-NEXT: v_mul_lo_u32 v0, s3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, s5, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: 
v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: s_mul_i32 s2, s2, s7 ; GFX8-NEXT: s_sub_i32 s2, s5, s2 @@ -332,34 +335,36 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; ; GFX6-LABEL: test_udivrem_v4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 +; GFX6-NEXT: s_mov_b32 s15, 0xf000 +; GFX6-NEXT: s_mov_b32 s14, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: s_sub_i32 s2, 0, s8 +; GFX6-NEXT: s_sub_i32 s0, 0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s10 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s8 -; GFX6-NEXT: s_sub_i32 s2, s4, s2 -; GFX6-NEXT: s_sub_i32 s3, s2, s8 -; GFX6-NEXT: s_cmp_ge_u32 s2, s8 -; GFX6-NEXT: s_cselect_b32 s2, s3, s2 -; GFX6-NEXT: s_sub_i32 s3, s2, s8 -; GFX6-NEXT: s_cmp_ge_u32 s2, s8 -; GFX6-NEXT: s_cselect_b32 s4, s3, s2 -; GFX6-NEXT: s_sub_i32 s2, 0, s9 -; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s0, s0, s8 +; GFX6-NEXT: s_sub_i32 s0, s4, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s8 +; GFX6-NEXT: s_cmp_ge_u32 s0, s8 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s8 +; GFX6-NEXT: s_cmp_ge_u32 s0, s8 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, 0, s9 +; GFX6-NEXT: v_mul_lo_u32 v0, s1, v1 ; GFX6-NEXT: 
v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 @@ -367,87 +372,82 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s11 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s9 -; GFX6-NEXT: s_sub_i32 s2, s5, s2 -; GFX6-NEXT: s_sub_i32 s3, s2, s9 -; GFX6-NEXT: s_cmp_ge_u32 s2, s9 -; GFX6-NEXT: s_cselect_b32 s2, s3, s2 -; GFX6-NEXT: s_sub_i32 s3, s2, s9 -; GFX6-NEXT: s_cmp_ge_u32 s2, s9 -; GFX6-NEXT: s_cselect_b32 s5, s3, s2 -; GFX6-NEXT: s_sub_i32 s2, 0, s10 -; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX6-NEXT: v_readfirstlane_b32 s1, v0 +; GFX6-NEXT: s_mul_i32 s1, s1, s9 +; GFX6-NEXT: s_sub_i32 s1, s5, s1 +; GFX6-NEXT: s_sub_i32 s4, s1, s9 +; GFX6-NEXT: s_cmp_ge_u32 s1, s9 +; GFX6-NEXT: s_cselect_b32 s1, s4, s1 +; GFX6-NEXT: s_sub_i32 s4, s1, s9 +; GFX6-NEXT: s_cmp_ge_u32 s1, s9 +; GFX6-NEXT: s_cselect_b32 s1, s4, s1 +; GFX6-NEXT: s_sub_i32 s4, 0, s10 +; GFX6-NEXT: v_mul_lo_u32 v0, s4, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s10 -; GFX6-NEXT: s_sub_i32 s2, s6, s2 -; GFX6-NEXT: s_sub_i32 s3, s2, s10 -; GFX6-NEXT: s_cmp_ge_u32 s2, s10 -; GFX6-NEXT: s_cselect_b32 s2, s3, s2 -; GFX6-NEXT: s_sub_i32 s3, s2, s10 -; GFX6-NEXT: s_cmp_ge_u32 s2, s10 -; GFX6-NEXT: s_cselect_b32 s6, s3, s2 -; GFX6-NEXT: s_sub_i32 s2, 0, s11 -; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_mul_i32 s4, s4, s10 +; GFX6-NEXT: s_sub_i32 s4, s6, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, s10 +; 
GFX6-NEXT: s_cmp_ge_u32 s4, s10 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, s10 +; GFX6-NEXT: s_cmp_ge_u32 s4, s10 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: s_sub_i32 s5, 0, s11 +; GFX6-NEXT: v_mul_lo_u32 v0, s5, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v2, s7, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: v_readfirstlane_b32 s4, v2 -; GFX6-NEXT: s_mul_i32 s4, s4, s11 -; GFX6-NEXT: s_sub_i32 s4, s7, s4 -; GFX6-NEXT: s_sub_i32 s5, s4, s11 -; GFX6-NEXT: s_cmp_ge_u32 s4, s11 -; GFX6-NEXT: s_cselect_b32 s4, s5, s4 -; GFX6-NEXT: s_sub_i32 s5, s4, s11 -; GFX6-NEXT: s_cmp_ge_u32 s4, s11 -; GFX6-NEXT: s_cselect_b32 s4, s5, s4 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, s4 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_readfirstlane_b32 s0, v2 +; GFX6-NEXT: s_mul_i32 s0, s0, s11 +; GFX6-NEXT: s_sub_i32 s0, s7, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s11 +; GFX6-NEXT: s_cmp_ge_u32 s0, s11 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s11 +; GFX6-NEXT: s_cmp_ge_u32 s0, s11 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v3, s0 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: test_udivrem_v4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX8-NEXT: s_sub_i32 s2, 0, s8 +; GFX8-NEXT: s_sub_i32 s0, 0, s8 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s9 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: 
v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s10 -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: s_mul_i32 s2, s2, s8 -; GFX8-NEXT: s_sub_i32 s2, s4, s2 -; GFX8-NEXT: s_sub_i32 s3, s2, s8 -; GFX8-NEXT: s_cmp_ge_u32 s2, s8 -; GFX8-NEXT: s_cselect_b32 s2, s3, s2 -; GFX8-NEXT: s_sub_i32 s3, s2, s8 -; GFX8-NEXT: s_cmp_ge_u32 s2, s8 -; GFX8-NEXT: s_cselect_b32 s2, s3, s2 -; GFX8-NEXT: s_sub_i32 s3, 0, s9 -; GFX8-NEXT: v_mul_lo_u32 v0, s3, v1 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: s_mul_i32 s0, s0, s8 +; GFX8-NEXT: s_sub_i32 s0, s4, s0 +; GFX8-NEXT: s_sub_i32 s1, s0, s8 +; GFX8-NEXT: s_cmp_ge_u32 s0, s8 +; GFX8-NEXT: s_cselect_b32 s0, s1, s0 +; GFX8-NEXT: s_sub_i32 s1, s0, s8 +; GFX8-NEXT: s_cmp_ge_u32 s0, s8 +; GFX8-NEXT: s_cselect_b32 s4, s1, s0 +; GFX8-NEXT: s_sub_i32 s0, 0, s9 +; GFX8-NEXT: v_mul_lo_u32 v0, s0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 @@ -455,40 +455,44 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11 -; GFX8-NEXT: v_readfirstlane_b32 s3, v0 -; GFX8-NEXT: s_mul_i32 s3, s3, s9 -; GFX8-NEXT: s_sub_i32 s3, s5, s3 -; GFX8-NEXT: s_sub_i32 s4, s3, s9 -; GFX8-NEXT: s_cmp_ge_u32 s3, s9 -; GFX8-NEXT: s_cselect_b32 s3, s4, s3 -; GFX8-NEXT: s_sub_i32 s4, s3, s9 -; GFX8-NEXT: s_cmp_ge_u32 s3, s9 -; GFX8-NEXT: s_cselect_b32 s3, s4, s3 -; GFX8-NEXT: s_sub_i32 s4, 0, s10 -; GFX8-NEXT: v_mul_lo_u32 v0, s4, v1 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: s_mul_i32 s0, 
s0, s9 +; GFX8-NEXT: s_sub_i32 s0, s5, s0 +; GFX8-NEXT: s_sub_i32 s1, s0, s9 +; GFX8-NEXT: s_cmp_ge_u32 s0, s9 +; GFX8-NEXT: s_cselect_b32 s0, s1, s0 +; GFX8-NEXT: s_sub_i32 s1, s0, s9 +; GFX8-NEXT: s_cmp_ge_u32 s0, s9 +; GFX8-NEXT: s_cselect_b32 s5, s1, s0 +; GFX8-NEXT: s_sub_i32 s0, 0, s10 +; GFX8-NEXT: v_mul_lo_u32 v0, s0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: s_mul_i32 s4, s4, s10 -; GFX8-NEXT: s_sub_i32 s4, s6, s4 -; GFX8-NEXT: s_sub_i32 s5, s4, s10 -; GFX8-NEXT: s_cmp_ge_u32 s4, s10 -; GFX8-NEXT: s_cselect_b32 s4, s5, s4 -; GFX8-NEXT: s_sub_i32 s5, s4, s10 -; GFX8-NEXT: s_cmp_ge_u32 s4, s10 -; GFX8-NEXT: s_cselect_b32 s4, s5, s4 -; GFX8-NEXT: s_sub_i32 s5, 0, s11 -; GFX8-NEXT: v_mul_lo_u32 v0, s5, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: s_mul_i32 s0, s0, s10 +; GFX8-NEXT: s_sub_i32 s0, s6, s0 +; GFX8-NEXT: s_sub_i32 s1, s0, s10 +; GFX8-NEXT: s_cmp_ge_u32 s0, s10 +; GFX8-NEXT: s_cselect_b32 s0, s1, s0 +; GFX8-NEXT: s_sub_i32 s1, s0, s10 +; GFX8-NEXT: s_cmp_ge_u32 s0, s10 +; GFX8-NEXT: s_cselect_b32 s6, s1, s0 +; GFX8-NEXT: s_sub_i32 s0, 0, s11 +; GFX8-NEXT: v_mul_lo_u32 v0, s0, v1 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v3, s7, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_readfirstlane_b32 s2, v3 ; GFX8-NEXT: s_mul_i32 s2, s2, s11 ; GFX8-NEXT: s_sub_i32 s2, s7, s2 diff --git 
a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll index ba52d702c7ed1..d00ea6dff2447 100644 --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll @@ -7,7 +7,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_uint_to_fp_i64_to_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s3 @@ -26,7 +26,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_uint_to_fp_i64_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -53,7 +53,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: s_uint_to_fp_i64_to_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -66,7 +66,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i ; ; VI-LABEL: s_uint_to_fp_i64_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -84,8 +84,8 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 x i64> %in) { ; 
SI-LABEL: s_uint_to_fp_v2i64_to_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1 @@ -102,12 +102,12 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 ; ; VI-LABEL: s_uint_to_fp_v2i64_to_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1 ; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], s0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32 ; VI-NEXT: v_ldexp_f64 v[4:5], v[2:3], 32 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -126,8 +126,8 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 x i64> %in) { ; SI-LABEL: s_uint_to_fp_v4i64_to_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s11 ; SI-NEXT: v_cvt_f64_u32_e32 v[4:5], s9 @@ -158,8 +158,8 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 ; ; VI-LABEL: s_uint_to_fp_v4i64_to_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s15 ; VI-NEXT: v_cvt_f64_u32_e32 v[4:5], 
s13 @@ -194,8 +194,8 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_uint_to_fp_i32_to_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[4:5], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dword s2, s[6:7], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; SI-NEXT: v_mov_b32_e32 v3, s1 @@ -205,8 +205,8 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i ; ; VI-LABEL: s_uint_to_fp_i32_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -221,7 +221,7 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2 x i32> %in) { ; GCN-LABEL: s_uint_to_fp_v2i32_to_v2f64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f64_u32_e32 v[2:3], s3 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 @@ -237,8 +237,8 @@ define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2 define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 x i32> %in) { ; SI-LABEL: s_uint_to_fp_v4i32_to_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; SI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3 @@ -257,8 +257,8 @@ 
define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 ; ; VI-LABEL: s_uint_to_fp_v4i32_to_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3 @@ -284,8 +284,8 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: uint_to_fp_i1_to_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[4:5], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dword s2, s[6:7], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 @@ -298,8 +298,8 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: uint_to_fp_i1_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 @@ -318,8 +318,8 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %in) { ; SI-LABEL: uint_to_fp_i1_to_f64_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[4:5], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dword s2, s[6:7], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s2, 0 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -332,8 +332,8 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 % ; ; VI-LABEL: 
uint_to_fp_i1_to_f64_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s2, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -351,8 +351,8 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 % define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) { ; SI-LABEL: s_uint_to_fp_i8_to_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[4:5], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dword s2, s[6:7], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s2, s2, 0xff ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 @@ -363,8 +363,8 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) ; ; VI-LABEL: s_uint_to_fp_i8_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0xff ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 @@ -400,8 +400,8 @@ define double @v_uint_to_fp_i8_to_f64(i8 %in) { define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_select_uint_to_fp_i1_vals_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[4:5], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dword s2, s[6:7], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 @@ -414,8 +414,8 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; ; VI-LABEL: s_select_uint_to_fp_i1_vals_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 @@ -451,8 +451,8 @@ define void @v_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_select_uint_to_fp_i1_vals_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[4:5], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dword s2, s[6:7], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 @@ -465,8 +465,8 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; ; VI-LABEL: s_select_uint_to_fp_i1_vals_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 @@ -503,8 +503,8 @@ define void @v_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_swap_select_uint_to_fp_i1_vals_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[4:5], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dword s2, s[6:7], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 @@ -517,8 +517,8 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) ; ; VI-LABEL: s_swap_select_uint_to_fp_i1_vals_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: 
s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll index 79b0a966bc1fb..3d0fc4e6281a6 100644 --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %in) #0 { ; GFX6-LABEL: s_uint_to_fp_i64_to_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -28,7 +28,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i ; ; GFX8-LABEL: s_uint_to_fp_i64_to_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_flbit_i32_b32 s4, s3 ; GFX8-NEXT: s_min_u32 s4, s4, 32 @@ -46,7 +46,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i ; ; GFX11-LABEL: s_uint_to_fp_i64_to_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clz_i32_u32 s4, s3 @@ -75,7 +75,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_uint_to_fp_i64_to_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -100,7 +100,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, 
ptr ad ; ; GFX8-LABEL: v_uint_to_fp_i64_to_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -126,26 +126,27 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_uint_to_fp_i64_to_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_clz_i32_u32_e32 v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_clz_i32_u32_e32 v3, v1 ; GFX11-NEXT: v_min_u32_e32 v3, 32, v3 -; GFX11-NEXT: v_lshlrev_b64 v[1:2], v3, v[1:2] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 -; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX11-NEXT: v_ldexp_f32 v1, v1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] +; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v3 +; GFX11-NEXT: 
v_cvt_f32_u32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -161,7 +162,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %in) #0 { ; GFX6-LABEL: s_uint_to_fp_i64_to_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -180,7 +181,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i ; ; GFX8-LABEL: s_uint_to_fp_i64_to_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_flbit_i32_b32 s4, s3 ; GFX8-NEXT: s_min_u32 s4, s4, 32 @@ -197,7 +198,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i ; ; GFX11-LABEL: s_uint_to_fp_i64_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clz_i32_u32 s4, s3 @@ -224,7 +225,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_uint_to_fp_i64_to_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 3, 
v0 @@ -248,7 +249,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: v_uint_to_fp_i64_to_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -273,24 +274,26 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_uint_to_fp_i64_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_clz_i32_u32_e32 v3, v2 +; GFX11-NEXT: v_clz_i32_u32_e32 v3, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v3, 32, v3 -; GFX11-NEXT: v_lshlrev_b64 v[1:2], v3, v[1:2] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 -; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v3 +; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX11-NEXT: v_ldexp_f32 v1, v1, v2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX11-NEXT: 
global_store_b32 v2, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -306,8 +309,8 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 x i64> %in) #0{ ; GFX6-LABEL: s_uint_to_fp_v2i64_to_v2f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -332,8 +335,8 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 ; ; GFX8-LABEL: s_uint_to_fp_v2i64_to_v2f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_flbit_i32_b32 s2, s7 ; GFX8-NEXT: s_flbit_i32_b32 s3, s5 @@ -359,8 +362,8 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 ; GFX11-LABEL: s_uint_to_fp_v2i64_to_v2f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clz_i32_u32 s2, s7 @@ -392,7 +395,7 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_uint_to_fp_v4i64_to_v4f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; 
GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 5, v0 @@ -443,7 +446,7 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: v_uint_to_fp_v4i64_to_v4f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -496,51 +499,53 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_uint_to_fp_v4i64_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16 -; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3] +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] offset:16 +; GFX11-NEXT: global_load_b128 v[4:7], v4, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_clz_i32_u32_e32 v9, v4 -; GFX11-NEXT: v_clz_i32_u32_e32 v10, v2 +; GFX11-NEXT: v_clz_i32_u32_e32 v9, v3 +; GFX11-NEXT: v_clz_i32_u32_e32 v10, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_clz_i32_u32_e32 v11, v8 -; GFX11-NEXT: v_clz_i32_u32_e32 v12, v6 +; GFX11-NEXT: v_clz_i32_u32_e32 v11, v7 +; GFX11-NEXT: v_clz_i32_u32_e32 v12, v5 ; GFX11-NEXT: v_min_u32_e32 v9, 32, v9 ; GFX11-NEXT: v_min_u32_e32 v10, 32, v10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_min_u32_e32 v11, 32, v11 ; GFX11-NEXT: v_min_u32_e32 v12, 32, v12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[3:4], v9, v[3:4] -; GFX11-NEXT: 
v_lshlrev_b64 v[1:2], v10, v[1:2] +; GFX11-NEXT: v_lshlrev_b64 v[2:3], v9, v[2:3] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v10, v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[7:8], v11, v[7:8] -; GFX11-NEXT: v_lshlrev_b64 v[5:6], v12, v[5:6] +; GFX11-NEXT: v_lshlrev_b64 v[6:7], v11, v[6:7] +; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, v[4:5] ; GFX11-NEXT: v_sub_nc_u32_e32 v9, 32, v9 ; GFX11-NEXT: v_sub_nc_u32_e32 v10, 32, v10 -; GFX11-NEXT: v_min_u32_e32 v3, 1, v3 -; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 -; GFX11-NEXT: v_min_u32_e32 v7, 1, v7 -; GFX11-NEXT: v_min_u32_e32 v5, 1, v5 +; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX11-NEXT: v_min_u32_e32 v6, 1, v6 +; GFX11-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX11-NEXT: v_sub_nc_u32_e32 v11, 32, v11 -; GFX11-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX11-NEXT: v_or_b32_e32 v2, v8, v7 -; GFX11-NEXT: v_or_b32_e32 v4, v6, v5 -; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v12 -; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v7, v6 +; GFX11-NEXT: v_or_b32_e32 v3, v5, v4 +; GFX11-NEXT: v_sub_nc_u32_e32 v4, 32, v12 +; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f32_u32_e32 v6, v2 -; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 4, v0 -; GFX11-NEXT: v_ldexp_f32 v3, v3, v9 -; GFX11-NEXT: v_ldexp_f32 v2, v1, v10 -; GFX11-NEXT: v_ldexp_f32 v1, v6, v11 -; GFX11-NEXT: v_ldexp_f32 v0, v4, v5 -; GFX11-NEXT: global_store_b128 v7, v[0:3], s[0:1] +; GFX11-NEXT: v_cvt_f32_u32_e32 v5, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 4, v8 +; GFX11-NEXT: v_ldexp_f32 v3, v2, v9 +; GFX11-NEXT: v_ldexp_f32 v2, v0, v10 +; GFX11-NEXT: v_ldexp_f32 v1, v1, v11 +; GFX11-NEXT: v_ldexp_f32 v0, v5, v4 +; GFX11-NEXT: 
global_store_b128 v6, v[0:3], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -556,8 +561,8 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 x i64> %in) #0{ ; GFX6-LABEL: s_uint_to_fp_v2i64_to_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -586,8 +591,8 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 ; ; GFX8-LABEL: s_uint_to_fp_v2i64_to_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_flbit_i32_b32 s2, s7 ; GFX8-NEXT: s_flbit_i32_b32 s3, s5 @@ -616,8 +621,8 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 ; GFX11-LABEL: s_uint_to_fp_v2i64_to_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clz_i32_u32 s2, s7 @@ -654,7 +659,7 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_uint_to_fp_v4i64_to_v4f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 
0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 5, v0 @@ -713,7 +718,7 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: v_uint_to_fp_v4i64_to_v4f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -772,59 +777,61 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_uint_to_fp_v4i64_to_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16 -; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3] +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] offset:16 +; GFX11-NEXT: global_load_b128 v[4:7], v4, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_clz_i32_u32_e32 v9, v4 -; GFX11-NEXT: v_clz_i32_u32_e32 v10, v2 +; GFX11-NEXT: v_clz_i32_u32_e32 v9, v3 +; GFX11-NEXT: v_clz_i32_u32_e32 v10, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_clz_i32_u32_e32 v11, v8 -; GFX11-NEXT: v_clz_i32_u32_e32 v12, v6 +; GFX11-NEXT: v_clz_i32_u32_e32 v11, v7 +; GFX11-NEXT: v_clz_i32_u32_e32 v12, v5 ; GFX11-NEXT: v_min_u32_e32 v9, 32, v9 ; GFX11-NEXT: v_min_u32_e32 v10, 32, v10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_min_u32_e32 v11, 32, v11 ; GFX11-NEXT: v_min_u32_e32 v12, 32, v12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[3:4], v9, v[3:4] -; GFX11-NEXT: 
v_lshlrev_b64 v[1:2], v10, v[1:2] +; GFX11-NEXT: v_lshlrev_b64 v[2:3], v9, v[2:3] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v10, v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[7:8], v11, v[7:8] -; GFX11-NEXT: v_lshlrev_b64 v[5:6], v12, v[5:6] +; GFX11-NEXT: v_lshlrev_b64 v[6:7], v11, v[6:7] +; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, v[4:5] ; GFX11-NEXT: v_sub_nc_u32_e32 v9, 32, v9 ; GFX11-NEXT: v_sub_nc_u32_e32 v10, 32, v10 -; GFX11-NEXT: v_min_u32_e32 v3, 1, v3 -; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 -; GFX11-NEXT: v_min_u32_e32 v7, 1, v7 -; GFX11-NEXT: v_min_u32_e32 v5, 1, v5 +; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX11-NEXT: v_min_u32_e32 v6, 1, v6 +; GFX11-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX11-NEXT: v_sub_nc_u32_e32 v11, 32, v11 -; GFX11-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX11-NEXT: v_or_b32_e32 v2, v8, v7 -; GFX11-NEXT: v_or_b32_e32 v4, v6, v5 -; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v12 -; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v7, v6 +; GFX11-NEXT: v_or_b32_e32 v3, v5, v4 +; GFX11-NEXT: v_sub_nc_u32_e32 v4, 32, v12 ; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v8 +; GFX11-NEXT: v_ldexp_f32 v2, v2, v9 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v10 +; GFX11-NEXT: v_ldexp_f32 v1, v1, v11 +; GFX11-NEXT: v_ldexp_f32 v3, v3, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_ldexp_f32 v3, v3, v9 -; GFX11-NEXT: v_ldexp_f32 v1, v1, v10 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_ldexp_f32 v2, v2, v11 -; GFX11-NEXT: v_ldexp_f32 v4, v4, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v1 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3 -; GFX11-NEXT: v_pack_b32_f16 v0, v4, v2 +; GFX11-NEXT: v_pack_b32_f16 v1, v0, v2 +; GFX11-NEXT: v_pack_b32_f16 v0, v3, v4 ; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll index 5f8d0f665a953..c21ae434f4470 100644 --- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @uitofp_i16_to_f16( ; SI-LABEL: uitofp_i16_to_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @uitofp_i16_to_f16( ; ; VI-LABEL: uitofp_i16_to_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -43,7 +43,7 @@ define amdgpu_kernel void @uitofp_i16_to_f16( ; ; GFX11-LABEL: uitofp_i16_to_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -72,7 +72,7 @@ entry: define amdgpu_kernel void @uitofp_i32_to_f16( ; SI-LABEL: 
uitofp_i32_to_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -91,7 +91,7 @@ define amdgpu_kernel void @uitofp_i32_to_f16( ; ; VI-LABEL: uitofp_i32_to_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -110,7 +110,7 @@ define amdgpu_kernel void @uitofp_i32_to_f16( ; ; GFX11-LABEL: uitofp_i32_to_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -143,7 +143,7 @@ entry: define amdgpu_kernel void @uitofp_v2i16_to_v2f16( ; SI-LABEL: uitofp_v2i16_to_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -168,7 +168,7 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16( ; ; VI-LABEL: uitofp_v2i16_to_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -188,7 +188,7 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16( ; ; GFX11-LABEL: uitofp_v2i16_to_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -221,7 +221,7 @@ entry: define amdgpu_kernel void @uitofp_v2i32_to_v2f16( ; SI-LABEL: uitofp_v2i32_to_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -244,7 +244,7 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16( ; ; VI-LABEL: uitofp_v2i32_to_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -266,7 +266,7 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16( ; ; GFX11-LABEL: uitofp_v2i32_to_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -301,19 +301,21 @@ entry: define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: s_uint_to_fp_i1_to_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -321,26 +323,26 @@ define amdgpu_kernel 
void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; SI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1] ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_uint_to_fp_i1_to_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -348,16 +350,14 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1] ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_uint_to_fp_i1_to_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], 
s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll index f60a274f1e592..a3fc6ded0a004 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll @@ -5,36 +5,36 @@ define amdgpu_kernel void @uniform_if_scc(i32 %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_scc: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s2, s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s2, 0 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_cmp_eq_u32 s0, 0 +; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_cbranch_scc1 .LBB0_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s2, 1 +; SI-NEXT: s_mov_b32 s0, 1 ; SI-NEXT: .LBB0_2: ; %done -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: uniform_if_scc: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s2, s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s2, 0 -; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 0 +; VI-NEXT: s_mov_b32 s0, 0 ; VI-NEXT: s_cbranch_scc1 .LBB0_2 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_mov_b32 s2, 1 +; VI-NEXT: s_mov_b32 s0, 1 ; VI-NEXT: .LBB0_2: ; %done -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ 
-57,38 +57,38 @@ done: define amdgpu_kernel void @uniform_if_vcc(float %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_vcc: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s3, s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dword s1, s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_eq_f32_e64 s[4:5], s3, 0 +; SI-NEXT: v_cmp_eq_f32_e64 s[4:5], s1, 0 ; SI-NEXT: s_and_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB1_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s2, 1 +; SI-NEXT: s_mov_b32 s0, 1 ; SI-NEXT: .LBB1_2: ; %done -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: uniform_if_vcc: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s3, s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: s_load_dword s1, s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], s3, 0 +; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], s1, 0 ; VI-NEXT: s_and_b64 vcc, exec, s[4:5] ; VI-NEXT: s_cbranch_vccnz .LBB1_2 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_mov_b32 s2, 1 +; VI-NEXT: s_mov_b32 s0, 1 ; VI-NEXT: .LBB1_2: ; %done -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -111,36 +111,36 @@ done: define amdgpu_kernel void @uniform_if_swap_br_targets_scc(i32 %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_swap_br_targets_scc: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s2, s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, 
s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s2, 0 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_cbranch_scc1 .LBB2_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s2, 1 +; SI-NEXT: s_mov_b32 s0, 1 ; SI-NEXT: .LBB2_2: ; %done -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: uniform_if_swap_br_targets_scc: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s2, s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s2, 0 -; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_mov_b32 s0, 0 ; VI-NEXT: s_cbranch_scc1 .LBB2_2 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_mov_b32 s2, 1 +; VI-NEXT: s_mov_b32 s0, 1 ; VI-NEXT: .LBB2_2: ; %done -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -163,38 +163,38 @@ done: define amdgpu_kernel void @uniform_if_swap_br_targets_vcc(float %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_swap_br_targets_vcc: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s3, s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dword s1, s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_neq_f32_e64 s[4:5], s3, 0 +; SI-NEXT: v_cmp_neq_f32_e64 s[4:5], s1, 0 ; SI-NEXT: s_and_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB3_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s2, 1 +; SI-NEXT: 
s_mov_b32 s0, 1 ; SI-NEXT: .LBB3_2: ; %done -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: uniform_if_swap_br_targets_vcc: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s3, s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: s_load_dword s1, s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_neq_f32_e64 s[4:5], s3, 0 +; VI-NEXT: v_cmp_neq_f32_e64 s[4:5], s1, 0 ; VI-NEXT: s_and_b64 vcc, exec, s[4:5] ; VI-NEXT: s_cbranch_vccnz .LBB3_2 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_mov_b32 s2, 1 +; VI-NEXT: s_mov_b32 s0, 1 ; VI-NEXT: .LBB3_2: ; %done -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -219,14 +219,14 @@ done: define amdgpu_kernel void @uniform_if_move_valu(ptr addrspace(1) %out, float %a) { ; SI-LABEL: uniform_if_move_valu: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: v_mov_b32_e32 v0, 0x41200000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e32 v0, s2, v0 +; SI-NEXT: v_add_f32_e32 v0, s0, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v0 ; SI-NEXT: s_cbranch_vccnz .LBB4_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -237,14 +237,14 @@ define amdgpu_kernel void @uniform_if_move_valu(ptr addrspace(1) %out, float %a) ; ; 
VI-LABEL: uniform_if_move_valu: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dword s0, s[2:3], 0x2c ; VI-NEXT: v_mov_b32_e32 v0, 0x41200000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e32 v0, s2, v0 +; VI-NEXT: v_add_f32_e32 v0, s0, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v0 ; VI-NEXT: s_cbranch_vccnz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -271,14 +271,14 @@ endif: define amdgpu_kernel void @uniform_if_move_valu_commute(ptr addrspace(1) %out, float %a) { ; SI-LABEL: uniform_if_move_valu_commute: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: v_mov_b32_e32 v0, 0x41200000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e32 v0, s2, v0 +; SI-NEXT: v_add_f32_e32 v0, s0, v0 ; SI-NEXT: v_cmp_gt_u32_e32 vcc, 6, v0 ; SI-NEXT: s_cbranch_vccnz .LBB5_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -289,14 +289,14 @@ define amdgpu_kernel void @uniform_if_move_valu_commute(ptr addrspace(1) %out, f ; ; VI-LABEL: uniform_if_move_valu_commute: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dword s0, s[2:3], 0x2c ; VI-NEXT: v_mov_b32_e32 v0, 0x41200000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e32 v0, s2, v0 +; VI-NEXT: v_add_f32_e32 v0, s0, v0 ; VI-NEXT: v_cmp_gt_u32_e32 vcc, 6, v0 ; VI-NEXT: s_cbranch_vccnz .LBB5_2 ; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -322,38 +322,36 @@ endif: define 
amdgpu_kernel void @uniform_if_else_ret(ptr addrspace(1) nocapture %out, i32 %a) { ; SI-LABEL: uniform_if_else_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s2, 0 +; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: s_cbranch_scc0 .LBB6_2 ; SI-NEXT: ; %bb.1: ; %if.else -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB6_2: ; %if.then -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: uniform_if_else_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: s_cbranch_scc0 .LBB6_2 ; VI-NEXT: ; %bb.1: ; %if.else -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB6_2: ; %if.then -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -376,8 +374,8 @@ if.end: ; preds = %if.else, %if.then define amdgpu_kernel void @uniform_if_else(ptr addrspace(1) nocapture %out0, ptr addrspace(1) nocapture %out1, i32 %a) { ; SI-LABEL: uniform_if_else: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: 
s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -403,8 +401,8 @@ define amdgpu_kernel void @uniform_if_else(ptr addrspace(1) nocapture %out0, ptr ; ; VI-LABEL: uniform_if_else: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -446,17 +444,17 @@ if.end: ; preds = %if.else, %if.then define amdgpu_kernel void @icmp_2_users(ptr addrspace(1) %out, i32 %cond) { ; SI-LABEL: icmp_2_users: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dword s4, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_gt_i32 s4, 0 -; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: s_cmp_lt_i32 s4, 1 ; SI-NEXT: s_cbranch_scc1 .LBB8_2 ; SI-NEXT: ; %bb.1: ; %IF -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: .LBB8_2: ; %ENDIF @@ -464,17 +462,17 @@ define amdgpu_kernel void @icmp_2_users(ptr addrspace(1) %out, i32 %cond) { ; ; VI-LABEL: icmp_2_users: ; VI: ; %bb.0: ; %main_body -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_gt_i32 s4, 0 -; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: s_cmp_lt_i32 s4, 1 ; VI-NEXT: s_cbranch_scc1 .LBB8_2 ; VI-NEXT: ; %bb.1: ; %IF -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[2:3] +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: .LBB8_2: ; %ENDIF @@ -495,20 +493,20 @@ ENDIF: ; preds = %IF, %main_body define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, ptr addrspace(1) %out) { ; SI-LABEL: icmp_users_different_blocks: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lt_i32 s2, 1 +; SI-NEXT: s_cmp_lt_i32 s0, 1 ; SI-NEXT: s_cbranch_scc1 .LBB9_2 ; SI-NEXT: ; %bb.1: ; %bb2 -; SI-NEXT: s_cmp_gt_i32 s3, 0 -; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; SI-NEXT: s_and_b64 vcc, exec, s[2:3] +; SI-NEXT: s_cmp_gt_i32 s1, 0 +; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; SI-NEXT: s_and_b64 vcc, exec, s[0:1] ; SI-NEXT: s_cbranch_vccz .LBB9_3 ; SI-NEXT: .LBB9_2: ; %bb9 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB9_3: ; %bb7 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb ; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -518,20 +516,20 @@ define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, p ; ; VI-LABEL: icmp_users_different_blocks: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lt_i32 s2, 1 +; VI-NEXT: s_cmp_lt_i32 s0, 1 ; VI-NEXT: s_cbranch_scc1 .LBB9_2 ; VI-NEXT: ; %bb.1: ; %bb2 -; VI-NEXT: s_cmp_gt_i32 s3, 0 -; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_and_b64 vcc, exec, s[2:3] +; VI-NEXT: s_cmp_gt_i32 s1, 0 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_and_b64 vcc, exec, s[0:1] ; VI-NEXT: s_cbranch_vccz .LBB9_3 ; VI-NEXT: .LBB9_2: ; %bb9 ; 
VI-NEXT: s_endpgm ; VI-NEXT: .LBB9_3: ; %bb7 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -560,7 +558,7 @@ bb9: ; preds = %bb8, %bb4 define amdgpu_kernel void @uniform_loop(ptr addrspace(1) %out, i32 %a) { ; SI-LABEL: uniform_loop: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: .LBB10_1: ; %loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -572,7 +570,7 @@ define amdgpu_kernel void @uniform_loop(ptr addrspace(1) %out, i32 %a) { ; ; VI-LABEL: uniform_loop: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dword s0, s[2:3], 0x2c ; VI-NEXT: .LBB10_1: ; %loop ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -600,11 +598,11 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 % ; SI-LABEL: uniform_inside_divergent: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 -; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc ; SI-NEXT: s_cbranch_execz .LBB11_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -623,11 +621,11 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 % ; VI-LABEL: uniform_inside_divergent: ; VI: ; %bb.0: ; %entry ; VI-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 -; VI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; VI-NEXT: s_and_saveexec_b64 s[0:1], vcc ; VI-NEXT: s_cbranch_execz .LBB11_2 ; VI-NEXT: ; %bb.1: 
; %if -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -662,14 +660,14 @@ endif: define amdgpu_kernel void @divergent_inside_uniform(ptr addrspace(1) %out, i32 %cond) { ; SI-LABEL: divergent_inside_uniform: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s2, 0 +; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cbranch_scc0 .LBB12_2 ; SI-NEXT: .LBB12_1: ; %endif ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB12_2: ; %if -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0 @@ -685,14 +683,14 @@ define amdgpu_kernel void @divergent_inside_uniform(ptr addrspace(1) %out, i32 % ; ; VI-LABEL: divergent_inside_uniform: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dword s0, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cbranch_scc0 .LBB12_2 ; VI-NEXT: .LBB12_1: ; %endif ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB12_2: ; %if -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 @@ -726,9 +724,9 @@ endif: define amdgpu_kernel void @divergent_if_uniform_if(ptr addrspace(1) %out, i32 %cond) { ; SI-LABEL: divergent_if_uniform_if: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc ; SI-NEXT: s_cbranch_execz 
.LBB13_2 ; SI-NEXT: ; %bb.1: ; %if ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -737,8 +735,8 @@ define amdgpu_kernel void @divergent_if_uniform_if(ptr addrspace(1) %out, i32 %c ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: .LBB13_2: ; %endif -; SI-NEXT: s_or_b64 exec, exec, s[2:3] -; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_or_b64 exec, exec, s[0:1] +; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 @@ -754,9 +752,9 @@ define amdgpu_kernel void @divergent_if_uniform_if(ptr addrspace(1) %out, i32 %c ; ; VI-LABEL: divergent_if_uniform_if: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; VI-NEXT: s_and_saveexec_b64 s[0:1], vcc ; VI-NEXT: s_cbranch_execz .LBB13_2 ; VI-NEXT: ; %bb.1: ; %if ; VI-NEXT: s_mov_b32 s7, 0xf000 @@ -765,8 +763,8 @@ define amdgpu_kernel void @divergent_if_uniform_if(ptr addrspace(1) %out, i32 %c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: .LBB13_2: ; %endif -; VI-NEXT: s_or_b64 exec, exec, s[2:3] -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: s_load_dword s0, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cbranch_scc0 .LBB13_4 @@ -807,12 +805,12 @@ exit: define amdgpu_kernel void @cse_uniform_condition_different_blocks(i32 %cond, ptr addrspace(1) %out) { ; SI-LABEL: cse_uniform_condition_different_blocks: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dword s2, s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lt_i32 s2, 1 +; SI-NEXT: s_cmp_lt_i32 s0, 1 ; SI-NEXT: s_cbranch_scc1 .LBB14_2 ; SI-NEXT: ; %bb.1: ; %bb2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0 @@ -827,12 +825,12 @@ define amdgpu_kernel void @cse_uniform_condition_different_blocks(i32 %cond, ptr ; ; VI-LABEL: cse_uniform_condition_different_blocks: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dword s2, s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lt_i32 s2, 1 +; VI-NEXT: s_cmp_lt_i32 s0, 1 ; VI-NEXT: s_cbranch_scc1 .LBB14_2 ; VI-NEXT: ; %bb.1: ; %bb2 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 @@ -867,7 +865,7 @@ bb9: ; preds = %bb8, %bb4 define amdgpu_kernel void @uniform_if_scc_i64_eq(i64 %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_scc_i64_eq: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_eq_u64_e64 s[4:5], s[0:1], 0 ; SI-NEXT: s_mov_b32 s0, 0 @@ -886,7 +884,7 @@ define amdgpu_kernel void @uniform_if_scc_i64_eq(i64 %cond, ptr addrspace(1) %ou ; ; VI-LABEL: uniform_if_scc_i64_eq: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u64 s[0:1], 0 ; VI-NEXT: s_mov_b32 s0, 0 @@ -921,7 +919,7 @@ done: define amdgpu_kernel void @uniform_if_scc_i64_ne(i64 %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_scc_i64_ne: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 ; SI-NEXT: s_mov_b32 s0, 0 @@ -940,7 +938,7 @@ define amdgpu_kernel void @uniform_if_scc_i64_ne(i64 %cond, ptr addrspace(1) %ou ; ; VI-LABEL: uniform_if_scc_i64_ne: ; VI: ; %bb.0: ; 
%entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 ; VI-NEXT: s_mov_b32 s0, 0 @@ -975,7 +973,7 @@ done: define amdgpu_kernel void @uniform_if_scc_i64_sgt(i64 %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_scc_i64_sgt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_gt_i64_e64 s[4:5], s[0:1], 0 ; SI-NEXT: s_mov_b32 s0, 0 @@ -994,7 +992,7 @@ define amdgpu_kernel void @uniform_if_scc_i64_sgt(i64 %cond, ptr addrspace(1) %o ; ; VI-LABEL: uniform_if_scc_i64_sgt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_gt_i64_e64 s[4:5], s[0:1], 0 ; VI-NEXT: s_mov_b32 s0, 0 @@ -1031,17 +1029,17 @@ define amdgpu_kernel void @move_to_valu_i64_eq(ptr addrspace(1) %out) { ; SI: ; %bb.0: ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b64 v[0:1], v0 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; SI-NEXT: s_cbranch_vccnz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s2, 1 +; SI-NEXT: s_mov_b32 s0, 1 ; SI-NEXT: .LBB18_2: ; %done -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -1050,17 +1048,17 @@ define amdgpu_kernel void @move_to_valu_i64_eq(ptr addrspace(1) %out) { ; VI: ; %bb.0: ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: ds_read_b64 v[0:1], v0 -; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: s_mov_b32 s0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, 
0, v[0:1] ; VI-NEXT: s_cbranch_vccnz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_mov_b32 s2, 1 +; VI-NEXT: s_mov_b32 s0, 1 ; VI-NEXT: .LBB18_2: ; %done -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -1085,17 +1083,17 @@ define amdgpu_kernel void @move_to_valu_i64_ne(ptr addrspace(1) %out) { ; SI: ; %bb.0: ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b64 v[0:1], v0 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; SI-NEXT: s_cbranch_vccnz .LBB19_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s2, 1 +; SI-NEXT: s_mov_b32 s0, 1 ; SI-NEXT: .LBB19_2: ; %done -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -1104,17 +1102,17 @@ define amdgpu_kernel void @move_to_valu_i64_ne(ptr addrspace(1) %out) { ; VI: ; %bb.0: ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: ds_read_b64 v[0:1], v0 -; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: s_mov_b32 s0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; VI-NEXT: s_cbranch_vccnz .LBB19_2 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_mov_b32 s2, 1 +; VI-NEXT: s_mov_b32 s0, 1 ; VI-NEXT: .LBB19_2: ; %done -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, 
off, s[4:7], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/uniform-select.ll b/llvm/test/CodeGen/AMDGPU/uniform-select.ll index 0cb408676552e..18b2397bbd5a7 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-select.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-select.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; GFX90A-LABEL: test_insert_extract: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX90A-NEXT: s_mov_b32 s2, 0 ; GFX90A-NEXT: s_and_b64 vcc, exec, -1 ; GFX90A-NEXT: s_mov_b32 s3, 0 @@ -55,7 +55,7 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; ; GFX940-LABEL: test_insert_extract: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX940-NEXT: s_mov_b32 s2, 0 ; GFX940-NEXT: s_and_b64 vcc, exec, -1 ; GFX940-NEXT: s_mov_b32 s3, 0 @@ -103,7 +103,7 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; ; GFX1030-LABEL: test_insert_extract: ; GFX1030: ; %bb.0: ; %entry -; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX1030-NEXT: s_mov_b32 s2, 0 ; GFX1030-NEXT: s_mov_b32 s3, 0 ; GFX1030-NEXT: s_mov_b32 s4, 0 @@ -151,7 +151,7 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; ; GFX1100-LABEL: test_insert_extract: ; GFX1100: ; %bb.0: ; %entry -; GFX1100-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1100-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX1100-NEXT: s_mov_b32 s2, 0 ; GFX1100-NEXT: s_mov_b32 s3, 0 ; GFX1100-NEXT: s_mov_b32 s4, 0 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index c0c84d46b7356..63105453174eb 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -5,8 +5,8 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: 
s_test_urem_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd +; GCN-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -122,8 +122,8 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; ; GCN-IR-LABEL: s_test_urem_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -413,18 +413,18 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_urem31_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[0:1], 0xe +; GCN-NEXT: s_load_dword s0, s[2:3], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s8, s2, 1 +; GCN-NEXT: s_lshr_b32 s8, s0, 1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-NEXT: s_sub_i32 s2, 0, s8 +; GCN-NEXT: s_sub_i32 s0, 0, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s2, s3, 1 @@ -448,18 +448,18 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64 ; ; GCN-IR-LABEL: s_test_urem31_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dword s0, s[2:3], 
0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_lshr_b32 s8, s2, 1 +; GCN-IR-NEXT: s_lshr_b32 s8, s0, 1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 +; GCN-IR-NEXT: s_sub_i32 s0, 0, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v0 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_lshr_b32 s2, s3, 1 @@ -490,110 +490,112 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64 define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) { ; GCN-LABEL: s_test_urem31_v2i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s2, s9, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-NEXT: s_sub_i32 s3, 0, s2 -; GCN-NEXT: s_lshr_b32 s4, s11, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_lshr_b32 s0, s9, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-NEXT: s_sub_i32 s1, 0, s0 +; GCN-NEXT: s_lshr_b32 s4, s5, 1 +; GCN-NEXT: s_lshr_b32 s8, s7, 1 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 -; GCN-NEXT: s_lshr_b32 s3, s5, 1 -; GCN-NEXT: s_lshr_b32 s5, s7, 1 +; GCN-NEXT: v_mul_lo_u32 v1, s1, v0 +; GCN-NEXT: s_lshr_b32 s1, s11, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s1 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 +; GCN-NEXT: 
v_rcp_iflag_f32_e32 v2, v2 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-NEXT: s_mul_i32 s6, s6, s2 -; GCN-NEXT: s_sub_i32 s3, s3, s6 -; GCN-NEXT: s_sub_i32 s6, s3, s2 -; GCN-NEXT: s_cmp_ge_u32 s3, s2 -; GCN-NEXT: s_cselect_b32 s3, s6, s3 -; GCN-NEXT: s_sub_i32 s6, s3, s2 -; GCN-NEXT: s_cmp_ge_u32 s3, s2 -; GCN-NEXT: s_cselect_b32 s6, s6, s3 -; GCN-NEXT: s_sub_i32 s2, 0, s4 -; GCN-NEXT: v_mul_lo_u32 v0, s2, v1 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_readfirstlane_b32 s5, v0 +; GCN-NEXT: s_mul_i32 s5, s5, s0 +; GCN-NEXT: s_sub_i32 s4, s4, s5 +; GCN-NEXT: s_sub_i32 s5, s4, s0 +; GCN-NEXT: s_cmp_ge_u32 s4, s0 +; GCN-NEXT: s_cselect_b32 s4, s5, s4 +; GCN-NEXT: s_sub_i32 s5, s4, s0 +; GCN-NEXT: s_cmp_ge_u32 s4, s0 +; GCN-NEXT: s_cselect_b32 s0, s5, s4 +; GCN-NEXT: s_sub_i32 s4, 0, s1 +; GCN-NEXT: v_mul_lo_u32 v0, s4, v1 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; GCN-NEXT: v_mul_hi_u32 v0, v1, v0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_mul_hi_u32 v2, s5, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mul_hi_u32 v2, s8, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NEXT: v_readfirstlane_b32 s6, v2 -; GCN-NEXT: s_mul_i32 s6, s6, s4 -; GCN-NEXT: s_sub_i32 s5, s5, s6 -; GCN-NEXT: s_sub_i32 s6, s5, s4 -; GCN-NEXT: s_cmp_ge_u32 s5, s4 -; GCN-NEXT: s_cselect_b32 s5, s6, s5 -; GCN-NEXT: s_sub_i32 s6, s5, s4 -; GCN-NEXT: s_cmp_ge_u32 s5, s4 -; GCN-NEXT: s_cselect_b32 s4, s6, s5 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NEXT: v_readfirstlane_b32 s0, v2 +; GCN-NEXT: s_mul_i32 s0, s0, s1 +; GCN-NEXT: s_sub_i32 s0, s8, s0 +; GCN-NEXT: s_sub_i32 s2, s0, s1 +; GCN-NEXT: s_cmp_ge_u32 s0, s1 +; GCN-NEXT: 
s_cselect_b32 s0, s2, s0 +; GCN-NEXT: s_sub_i32 s2, s0, s1 +; GCN-NEXT: s_cmp_ge_u32 s0, s1 +; GCN-NEXT: s_cselect_b32 s0, s2, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_urem31_v2i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_lshr_b32 s2, s9, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-IR-NEXT: s_sub_i32 s3, 0, s2 -; GCN-IR-NEXT: s_lshr_b32 s4, s11, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_lshr_b32 s0, s9, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-IR-NEXT: s_sub_i32 s1, 0, s0 +; GCN-IR-NEXT: s_lshr_b32 s4, s5, 1 +; GCN-IR-NEXT: s_lshr_b32 s8, s7, 1 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s3, v0 -; GCN-IR-NEXT: s_lshr_b32 s3, s5, 1 -; GCN-IR-NEXT: s_lshr_b32 s5, s7, 1 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s1, v0 +; GCN-IR-NEXT: s_lshr_b32 s1, s11, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s1 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s3, v0 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s4, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-IR-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-IR-NEXT: s_mul_i32 s6, s6, s2 -; GCN-IR-NEXT: s_sub_i32 s3, s3, s6 -; GCN-IR-NEXT: s_sub_i32 s6, s3, s2 -; GCN-IR-NEXT: s_cmp_ge_u32 s3, s2 -; GCN-IR-NEXT: s_cselect_b32 s3, s6, s3 -; GCN-IR-NEXT: s_sub_i32 s6, s3, s2 -; GCN-IR-NEXT: s_cmp_ge_u32 s3, s2 -; GCN-IR-NEXT: 
s_cselect_b32 s6, s6, s3 -; GCN-IR-NEXT: s_sub_i32 s2, 0, s4 -; GCN-IR-NEXT: v_mul_lo_u32 v0, s2, v1 -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: v_readfirstlane_b32 s5, v0 +; GCN-IR-NEXT: s_mul_i32 s5, s5, s0 +; GCN-IR-NEXT: s_sub_i32 s4, s4, s5 +; GCN-IR-NEXT: s_sub_i32 s5, s4, s0 +; GCN-IR-NEXT: s_cmp_ge_u32 s4, s0 +; GCN-IR-NEXT: s_cselect_b32 s4, s5, s4 +; GCN-IR-NEXT: s_sub_i32 s5, s4, s0 +; GCN-IR-NEXT: s_cmp_ge_u32 s4, s0 +; GCN-IR-NEXT: s_cselect_b32 s0, s5, s4 +; GCN-IR-NEXT: s_sub_i32 s4, 0, s1 +; GCN-IR-NEXT: v_mul_lo_u32 v0, s4, v1 +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; GCN-IR-NEXT: v_mul_hi_u32 v0, v1, v0 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-IR-NEXT: v_mul_hi_u32 v2, s5, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: v_mul_hi_u32 v2, s8, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 ; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 -; GCN-IR-NEXT: v_readfirstlane_b32 s6, v2 -; GCN-IR-NEXT: s_mul_i32 s6, s6, s4 -; GCN-IR-NEXT: s_sub_i32 s5, s5, s6 -; GCN-IR-NEXT: s_sub_i32 s6, s5, s4 -; GCN-IR-NEXT: s_cmp_ge_u32 s5, s4 -; GCN-IR-NEXT: s_cselect_b32 s5, s6, s5 -; GCN-IR-NEXT: s_sub_i32 s6, s5, s4 -; GCN-IR-NEXT: s_cmp_ge_u32 s5, s4 -; GCN-IR-NEXT: s_cselect_b32 s4, s6, s5 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s4 -; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-IR-NEXT: v_readfirstlane_b32 s0, v2 +; GCN-IR-NEXT: s_mul_i32 s0, s0, s1 +; GCN-IR-NEXT: s_sub_i32 s0, s8, s0 +; GCN-IR-NEXT: s_sub_i32 s2, s0, s1 +; GCN-IR-NEXT: s_cmp_ge_u32 s0, s1 +; GCN-IR-NEXT: s_cselect_b32 s0, s2, s0 +; GCN-IR-NEXT: s_sub_i32 s2, s0, s1 +; GCN-IR-NEXT: s_cmp_ge_u32 s0, s1 +; GCN-IR-NEXT: s_cselect_b32 s0, s2, s0 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s0 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr <2 x i64> %x, %2 = lshr <2 x i64> %y, @@ -605,8 +607,8 @@ define amdgpu_kernel void 
@s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64> define amdgpu_kernel void @s_test_urem24_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_urem24_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0xe -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xe +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_lshr_b32 s4, s4, 8 @@ -630,8 +632,8 @@ define amdgpu_kernel void @s_test_urem24_i64(ptr addrspace(1) %out, i64 %x, i64 ; ; GCN-IR-LABEL: s_test_urem24_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xe -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s4, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_lshr_b32 s4, s4, 8 @@ -662,110 +664,112 @@ define amdgpu_kernel void @s_test_urem24_i64(ptr addrspace(1) %out, i64 %x, i64 define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) { ; GCN-LABEL: s_test_urem23_64_v2i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s2, s9, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-NEXT: s_sub_i32 s3, 0, s2 -; GCN-NEXT: s_lshr_b32 s4, s11, 9 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_lshr_b32 s0, s9, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-NEXT: s_sub_i32 s1, 0, s0 +; GCN-NEXT: s_lshr_b32 s4, s5, 1 +; GCN-NEXT: s_lshr_b32 s8, s7, 9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 -; GCN-NEXT: s_lshr_b32 s3, s5, 
1 -; GCN-NEXT: s_lshr_b32 s5, s7, 9 +; GCN-NEXT: v_mul_lo_u32 v1, s1, v0 +; GCN-NEXT: s_lshr_b32 s1, s11, 9 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s1 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-NEXT: s_mul_i32 s6, s6, s2 -; GCN-NEXT: s_sub_i32 s3, s3, s6 -; GCN-NEXT: s_sub_i32 s6, s3, s2 -; GCN-NEXT: s_cmp_ge_u32 s3, s2 -; GCN-NEXT: s_cselect_b32 s3, s6, s3 -; GCN-NEXT: s_sub_i32 s6, s3, s2 -; GCN-NEXT: s_cmp_ge_u32 s3, s2 -; GCN-NEXT: s_cselect_b32 s6, s6, s3 -; GCN-NEXT: s_sub_i32 s2, 0, s4 -; GCN-NEXT: v_mul_lo_u32 v0, s2, v1 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_readfirstlane_b32 s5, v0 +; GCN-NEXT: s_mul_i32 s5, s5, s0 +; GCN-NEXT: s_sub_i32 s4, s4, s5 +; GCN-NEXT: s_sub_i32 s5, s4, s0 +; GCN-NEXT: s_cmp_ge_u32 s4, s0 +; GCN-NEXT: s_cselect_b32 s4, s5, s4 +; GCN-NEXT: s_sub_i32 s5, s4, s0 +; GCN-NEXT: s_cmp_ge_u32 s4, s0 +; GCN-NEXT: s_cselect_b32 s0, s5, s4 +; GCN-NEXT: s_sub_i32 s4, 0, s1 +; GCN-NEXT: v_mul_lo_u32 v0, s4, v1 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; GCN-NEXT: v_mul_hi_u32 v0, v1, v0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_mul_hi_u32 v2, s5, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mul_hi_u32 v2, s8, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NEXT: v_readfirstlane_b32 s6, v2 -; GCN-NEXT: s_mul_i32 s6, s6, s4 -; GCN-NEXT: s_sub_i32 s5, s5, s6 -; GCN-NEXT: s_sub_i32 s6, s5, s4 -; GCN-NEXT: s_cmp_ge_u32 s5, s4 -; GCN-NEXT: s_cselect_b32 s5, s6, s5 -; GCN-NEXT: s_sub_i32 s6, s5, s4 -; GCN-NEXT: s_cmp_ge_u32 s5, s4 -; GCN-NEXT: s_cselect_b32 s4, s6, s5 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 
0 +; GCN-NEXT: v_readfirstlane_b32 s0, v2 +; GCN-NEXT: s_mul_i32 s0, s0, s1 +; GCN-NEXT: s_sub_i32 s0, s8, s0 +; GCN-NEXT: s_sub_i32 s2, s0, s1 +; GCN-NEXT: s_cmp_ge_u32 s0, s1 +; GCN-NEXT: s_cselect_b32 s0, s2, s0 +; GCN-NEXT: s_sub_i32 s2, s0, s1 +; GCN-NEXT: s_cmp_ge_u32 s0, s1 +; GCN-NEXT: s_cselect_b32 s0, s2, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_urem23_64_v2i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_lshr_b32 s2, s9, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-IR-NEXT: s_sub_i32 s3, 0, s2 -; GCN-IR-NEXT: s_lshr_b32 s4, s11, 9 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_lshr_b32 s0, s9, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-IR-NEXT: s_sub_i32 s1, 0, s0 +; GCN-IR-NEXT: s_lshr_b32 s4, s5, 1 +; GCN-IR-NEXT: s_lshr_b32 s8, s7, 9 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s3, v0 -; GCN-IR-NEXT: s_lshr_b32 s3, s5, 1 -; GCN-IR-NEXT: s_lshr_b32 s5, s7, 9 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s1, v0 +; GCN-IR-NEXT: s_lshr_b32 s1, s11, 9 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s1 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s3, v0 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s4, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-IR-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-IR-NEXT: s_mul_i32 s6, s6, s2 -; GCN-IR-NEXT: s_sub_i32 s3, s3, s6 -; 
GCN-IR-NEXT: s_sub_i32 s6, s3, s2 -; GCN-IR-NEXT: s_cmp_ge_u32 s3, s2 -; GCN-IR-NEXT: s_cselect_b32 s3, s6, s3 -; GCN-IR-NEXT: s_sub_i32 s6, s3, s2 -; GCN-IR-NEXT: s_cmp_ge_u32 s3, s2 -; GCN-IR-NEXT: s_cselect_b32 s6, s6, s3 -; GCN-IR-NEXT: s_sub_i32 s2, 0, s4 -; GCN-IR-NEXT: v_mul_lo_u32 v0, s2, v1 -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: v_readfirstlane_b32 s5, v0 +; GCN-IR-NEXT: s_mul_i32 s5, s5, s0 +; GCN-IR-NEXT: s_sub_i32 s4, s4, s5 +; GCN-IR-NEXT: s_sub_i32 s5, s4, s0 +; GCN-IR-NEXT: s_cmp_ge_u32 s4, s0 +; GCN-IR-NEXT: s_cselect_b32 s4, s5, s4 +; GCN-IR-NEXT: s_sub_i32 s5, s4, s0 +; GCN-IR-NEXT: s_cmp_ge_u32 s4, s0 +; GCN-IR-NEXT: s_cselect_b32 s0, s5, s4 +; GCN-IR-NEXT: s_sub_i32 s4, 0, s1 +; GCN-IR-NEXT: v_mul_lo_u32 v0, s4, v1 +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; GCN-IR-NEXT: v_mul_hi_u32 v0, v1, v0 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-IR-NEXT: v_mul_hi_u32 v2, s5, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: v_mul_hi_u32 v2, s8, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 ; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 -; GCN-IR-NEXT: v_readfirstlane_b32 s6, v2 -; GCN-IR-NEXT: s_mul_i32 s6, s6, s4 -; GCN-IR-NEXT: s_sub_i32 s5, s5, s6 -; GCN-IR-NEXT: s_sub_i32 s6, s5, s4 -; GCN-IR-NEXT: s_cmp_ge_u32 s5, s4 -; GCN-IR-NEXT: s_cselect_b32 s5, s6, s5 -; GCN-IR-NEXT: s_sub_i32 s6, s5, s4 -; GCN-IR-NEXT: s_cmp_ge_u32 s5, s4 -; GCN-IR-NEXT: s_cselect_b32 s4, s6, s5 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s4 -; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-IR-NEXT: v_readfirstlane_b32 s0, v2 +; GCN-IR-NEXT: s_mul_i32 s0, s0, s1 +; GCN-IR-NEXT: s_sub_i32 s0, s8, s0 +; GCN-IR-NEXT: s_sub_i32 s2, s0, s1 +; GCN-IR-NEXT: s_cmp_ge_u32 s0, s1 +; GCN-IR-NEXT: s_cselect_b32 s0, s2, s0 +; GCN-IR-NEXT: s_sub_i32 s2, s0, s1 +; GCN-IR-NEXT: s_cmp_ge_u32 s0, s1 +; GCN-IR-NEXT: s_cselect_b32 s0, s2, s0 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s0 +; GCN-IR-NEXT: 
s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr <2 x i64> %x, %2 = lshr <2 x i64> %y, @@ -777,7 +781,7 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6 define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_urem_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -881,7 +885,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; ; GCN-IR-LABEL: s_test_urem_k_num_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3] @@ -961,7 +965,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_urem_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_add_u32 s0, 0, 0xaaaa0000 ; GCN-NEXT: v_not_b32_e32 v0, 23 ; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -977,7 +980,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_mul_hi_u32 v1, s0, v1 ; GCN-NEXT: v_mul_hi_u32 v4, s0, v0 ; GCN-NEXT: s_mul_i32 s8, s1, s8 -; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_mul_hi_u32 v4, s1, v0 @@ -1000,8 +1003,8 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_mul_lo_u32 v4, s7, v0 ; GCN-NEXT: v_mul_hi_u32 v0, 
s7, v0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc @@ -1010,6 +1013,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_mul_lo_u32 v1, v1, 24 ; GCN-NEXT: v_mul_hi_u32 v2, v0, 24 ; GCN-NEXT: v_mul_lo_u32 v0, v0, 24 +; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GCN-NEXT: v_mov_b32_e32 v2, s7 @@ -1038,7 +1042,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; ; GCN-IR-LABEL: s_test_urem_k_den_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3] ; GCN-IR-NEXT: s_sub_u32 s8, 59, s12 @@ -1389,7 +1393,7 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) { define amdgpu_kernel void @s_test_urem24_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_urem24_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s5, 0x41c00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s2, -1 @@ -1412,7 +1416,7 @@ define amdgpu_kernel void @s_test_urem24_k_num_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_urem24_k_num_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: s_mov_b32 s5, 0x41c00000 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s2, -1 @@ -1441,7 +1445,7 @@ define amdgpu_kernel void @s_test_urem24_k_num_i64(ptr addrspace(1) %out, i64 %x define amdgpu_kernel void @s_test_urem24_k_den_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_urem24_k_den_i64: ; GCN: ; 
%bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_movk_i32 s4, 0x5b7f ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -1466,7 +1470,7 @@ define amdgpu_kernel void @s_test_urem24_k_den_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_urem24_k_den_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-IR-NEXT: s_movk_i32 s4, 0x5b7f ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll index 666ae7c126ae3..dacc986205983 100644 --- a/llvm/test/CodeGen/AMDGPU/usubo.ll +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -7,8 +7,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 { ; SI-LABEL: s_usubo_i64_zext: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -28,8 +28,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; ; VI-LABEL: s_usubo_i64_zext: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_sub_u32 s0, s6, s0 @@ -47,14 +47,14 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; ; GFX9-LABEL: s_usubo_i64_zext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; 
GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_sub_u32 s0, s6, s2 +; GFX9-NEXT: s_sub_u32 s0, s6, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_subb_u32 s1, s7, s3 +; GFX9-NEXT: s_subb_u32 s1, s7, s1 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -75,8 +75,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; SI-LABEL: s_usubo_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -95,8 +95,8 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: s_usubo_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s1 @@ -111,12 +111,12 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_usubo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; 
GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: global_store_byte v0, v2, s[6:7] @@ -132,7 +132,7 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_usubo_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -161,7 +161,7 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_usubo_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -182,7 +182,7 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_usubo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] @@ -210,7 +210,7 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_usubo_i32_novcc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -243,7 +243,7 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_usubo_i32_novcc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -268,7 +268,7 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_usubo_i32_novcc: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] @@ -301,7 +301,7 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 { ; SI-LABEL: s_usubo_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -325,7 +325,7 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: s_usubo_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_sub_u32 s0, s4, s6 @@ -345,7 +345,7 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_usubo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sub_u32 s6, s4, s6 @@ -370,7 +370,7 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_usubo_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: 
s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -401,7 +401,7 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_usubo_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -424,7 +424,7 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_usubo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] @@ -454,7 +454,7 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_usubo_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -486,7 +486,7 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_usubo_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -508,7 +508,7 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_usubo_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] @@ -537,7 +537,7 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, 
ptr addrspace(1) % define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_usubo_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -568,7 +568,7 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_usubo_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -591,7 +591,7 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_usubo_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] @@ -618,45 +618,45 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; SI-LABEL: s_usubo_clamp_bit: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; SI-NEXT: s_cmp_eq_u32 s2, s3 -; SI-NEXT: s_mov_b64 s[2:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: s_cmp_eq_u32 s0, s1 +; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_cbranch_scc1 .LBB8_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: s_xor_b64 s[2:3], vcc, -1 +; SI-NEXT: s_xor_b64 s[0:1], vcc, -1 ; SI-NEXT: .LBB8_2: ; %exit -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; 
SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 -; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_usubo_clamp_bit: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: s_cmp_eq_u32 s2, s3 -; VI-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 -; VI-NEXT: s_mov_b64 s[2:3], 0 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: s_cmp_eq_u32 s0, s1 +; VI-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_cbranch_scc1 .LBB8_2 ; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: s_xor_b64 s[2:3], vcc, -1 +; VI-NEXT: s_xor_b64 s[0:1], vcc, -1 ; VI-NEXT: .LBB8_2: ; %exit -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3] +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 @@ -668,19 +668,19 @@ define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: s_usubo_clamp_bit: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; 
GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: s_cmp_eq_u32 s2, s3 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_cmp_eq_u32 s0, s1 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: s_xor_b64 s[2:3], vcc, -1 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, -1 ; GFX9-NEXT: .LBB8_2: ; %exit -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: global_store_byte v1, v2, s[6:7] @@ -707,7 +707,7 @@ exit: define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_usubo_clamp_bit: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s14, s2 @@ -741,7 +741,7 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_usubo_clamp_bit: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; VI-NEXT: s_mov_b64 s[2:3], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 @@ -768,7 +768,7 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_usubo_clamp_bit: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: 
s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll b/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll index ca4d689156b49..2210b6c0d3c3a 100644 --- a/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll +++ b/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll @@ -25,7 +25,7 @@ bb: define amdgpu_kernel void @test_add_co_sdwa(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 { ; GFX9-LABEL: test_add_co_sdwa: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll index 2fa9750653b6d..4b9b5f9ffdf84 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll @@ -23,7 +23,7 @@ entry: define amdgpu_kernel void @fcmp_test(half %x, half %y) { ; CHECK-LABEL: fcmp_test: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_b32 s0, s[0:1], 0x0 +; CHECK-NEXT: s_load_b32 s0, s[2:3], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_lshr_b32 s1, s0, 16 @@ -46,7 +46,7 @@ entry: define amdgpu_kernel void @ballot_test(half %x, half %y) { ; CHECK-LABEL: ballot_test: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_b32 s0, s[0:1], 0x0 +; CHECK-NEXT: s_load_b32 s0, s[2:3], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_lshr_b32 s1, s0, 16 diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll index fc6df735c05b0..a8f3635416cff 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -13,37 +13,37 @@ declare double @llvm.fabs.f64(double) define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { ; SI-LABEL: v_cnd_nan_nosgpr: ; 
SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_cmp_eq_u32 s8, 0 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cnd_nan_nosgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s2, 0 +; VI-NEXT: s_cmp_eq_u32 s4, 0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc @@ -54,35 +54,37 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a ; ; GFX10-LABEL: v_cnd_nan_nosgpr: ; GFX10: ; %bb.0: -; 
GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[0:1] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_eq_u32 s4, 0 ; GFX10-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cnd_nan_nosgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_eq_u32 s2, 0 +; GFX11-NEXT: s_cmp_eq_u32 s4, 0 ; GFX11-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc @@ -107,7 +109,7 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 { ; SI-LABEL: v_cnd_nan: ; SI: 
; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -122,7 +124,7 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 ; ; VI-LABEL: v_cnd_nan: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -135,7 +137,7 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 ; ; GFX10-LABEL: v_cnd_nan: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_eq_u32 s2, 0 @@ -146,7 +148,7 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 ; ; GFX11-LABEL: v_cnd_nan: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_eq_u32 s2, 0 @@ -169,30 +171,30 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x13 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s1 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; 
SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x4c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -200,26 +202,27 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s2, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s3, s[0:1] +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s0, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s1, s[2:3] ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 
s[0:1], s[2:3], 0x4c +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s3, s[4:5] -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s1, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -235,30 +238,30 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %out, float %x) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s0 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -266,24 +269,25 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[0:1] -; GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[2:3] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[2:3] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -301,30 +305,30 @@ define 
amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x13 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s1 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x4c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -332,26 +336,27 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s2, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s3, s[0:1] +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s0, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s1, s[2:3] ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s3, s[4:5] -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s1, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -367,30 +372,30 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %out, float %x) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; 
SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s0 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -398,24 +403,25 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[0:1] -; GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[2:3] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: 
s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[2:3] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -433,16 +439,16 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -451,20 +457,20 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %o ; ; VI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -472,32 +478,34 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %o ; ; GFX10-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[0:1] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; 
GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -518,16 +526,16 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -536,20 +544,20 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; ; VI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -557,32 +565,34 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; ; GFX10-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[0:1] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -603,8 +613,8 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, float %z) #0 { ; SI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -622,8 +632,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; ; VI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -642,9 +652,10 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; ; GFX10-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_clause 0x1 +; 
GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -655,9 +666,12 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; ; GFX11-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -681,8 +695,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -702,8 +716,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; ; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -727,13 +741,13 @@ 
define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc @@ -743,8 +757,10 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -773,8 +789,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -794,8 +810,8 @@ 
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o ; ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -819,13 +835,13 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc @@ -835,8 +851,10 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -865,8 +883,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o define amdgpu_kernel void 
@icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -887,8 +905,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o ; ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -913,13 +931,13 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1] ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc @@ -930,8 +948,10 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; 
GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] glc dlc @@ -961,8 +981,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -987,8 +1007,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) ; ; VI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1016,14 +1036,14 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v6, v4, s[6:7] 
glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] glc dlc +; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc @@ -1036,8 +1056,10 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) ; GFX11-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1070,8 +1092,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -1096,8 +1118,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) ; ; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1125,14 +1147,14 @@ define 
amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v6, v4, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] glc dlc +; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc @@ -1145,8 +1167,10 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1181,8 +1205,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: 
v_lshlrev_b32_e32 v1, 2, v0 @@ -1207,8 +1231,8 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) ; ; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1236,14 +1260,14 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v6, v4, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] glc dlc +; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc @@ -1256,8 +1280,10 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1290,8 +1316,8 @@ define 
amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 @@ -1315,8 +1341,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou ; ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s7 @@ -1343,13 +1369,13 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v2, v1, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_ubyte v3, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v3 @@ -1362,8 +1388,10 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 
0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v1, s[6:7] glc dlc @@ -1398,8 +1426,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -1423,8 +1451,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) ; ; VI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1451,14 +1479,14 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX10-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v2, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 0, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc @@ -1469,8 +1497,10 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) ; GFX11-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1502,8 +1532,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -1526,8 +1556,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) ; ; VI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1553,14 +1583,14 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v2, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -1571,8 +1601,10 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) ; GFX11-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1604,8 +1636,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; 
SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1625,8 +1657,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) ; ; VI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -1650,13 +1682,13 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) ; GFX10-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc @@ -1666,8 +1698,10 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) ; GFX11-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -1697,8 
+1731,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1722,8 +1756,8 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add ; ; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -1751,13 +1785,13 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add ; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, -1.0, vcc @@ -1771,8 +1805,10 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr 
add ; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -1808,18 +1844,18 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { ; SI-LABEL: v_cndmask_abs_neg_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_cmp_lg_u32 s8, 0 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e64 v1, |v0| ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 @@ -1827,22 +1863,22 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cndmask_abs_neg_f16: ; VI: ; %bb.0: -; VI-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v0 @@ -1855,15 +1891,15 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ; ; GFX10-LABEL: v_cndmask_abs_neg_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] +; GFX10-NEXT: global_load_ushort v0, v0, s[0:1] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1871,21 +1907,23 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ; GFX10-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX10-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX10-NEXT: global_store_short v2, v0, s[2:3] +; GFX10-NEXT: global_store_short v2, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: 
v_cndmask_abs_neg_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] +; GFX11-NEXT: global_load_u16 v0, v0, s[0:1] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v0 @@ -1910,37 +1948,37 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { ; SI-LABEL: v_cndmask_abs_neg_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_cmp_lg_u32 s8, 0 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; 
SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[0:1] -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cndmask_abs_neg_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e64 v2, -v0, |v0|, s[2:3] @@ -1951,35 +1989,37 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ; ; GFX10-LABEL: v_cndmask_abs_neg_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[0:1] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX10-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX10-NEXT: 
s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[0:1] -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cndmask_abs_neg_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3] @@ -2001,18 +2041,18 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { ; SI-LABEL: v_cndmask_abs_neg_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: 
s_mov_b32 s2, -1 -; SI-NEXT: s_cmp_lg_u32 s8, 0 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 ; SI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 @@ -2020,22 +2060,22 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cndmask_abs_neg_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 @@ -2049,15 +2089,15 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; ; GFX10-LABEL: v_cndmask_abs_neg_f64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] ; GFX10-NEXT: s_clause 0x1 -; 
GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_cselect_b64 vcc, -1, 0 @@ -2066,21 +2106,23 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[2:3] +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cndmask_abs_neg_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll index f7933d719f989..472a443cf6dde 100644 --- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -6,57 +6,57 @@ define amdgpu_kernel void @madak_f16( ; SI-LABEL: madak_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_madak_f32 v0, v0, v1, 0x41200000 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: madak_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; VI-NEXT: 
buffer_load_ushort v1, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_madak_f16 v0, v0, v1, 0x4900 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: madak_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -95,32 +95,32 @@ entry: define amdgpu_kernel void @madak_f16_use_2( ; SI-LABEL: madak_f16_use_2: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s18, s2 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_mov_b32 s18, s14 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s16, s8 ; SI-NEXT: s_mov_b32 s17, s9 -; SI-NEXT: s_mov_b32 s19, s3 +; SI-NEXT: s_mov_b32 s19, s15 ; SI-NEXT: s_mov_b32 s8, s10 ; SI-NEXT: s_mov_b32 s9, s11 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s14 +; SI-NEXT: s_mov_b32 s11, s15 +; SI-NEXT: s_mov_b32 s2, s14 +; SI-NEXT: s_mov_b32 s3, s15 ; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 glc +; SI-NEXT: buffer_load_ushort v2, off, 
s[0:3], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v3, 0x41200000 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s12, s4 +; SI-NEXT: s_mov_b32 s13, s5 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -128,49 +128,49 @@ define amdgpu_kernel void @madak_f16_use_2( ; SI-NEXT: v_mac_f32_e32 v3, v0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 -; SI-NEXT: buffer_store_short v1, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[12:15], 0 +; SI-NEXT: buffer_store_short v1, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: madak_f16_use_2: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s18, s2 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; VI-NEXT: s_mov_b32 s15, 0xf000 +; VI-NEXT: s_mov_b32 s14, -1 +; VI-NEXT: s_mov_b32 s18, s14 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s16, s8 ; VI-NEXT: s_mov_b32 s17, s9 -; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: s_mov_b32 s19, s15 ; VI-NEXT: s_mov_b32 s8, s10 ; VI-NEXT: s_mov_b32 s9, s11 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s14 +; VI-NEXT: s_mov_b32 s11, s15 +; VI-NEXT: s_mov_b32 s2, s14 +; VI-NEXT: s_mov_b32 s3, s15 ; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 glc +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 glc ; 
VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, 0x4900 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s12, s4 +; VI-NEXT: s_mov_b32 s13, s5 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: v_madak_f16 v1, v0, v1, 0x4900 ; VI-NEXT: v_mac_f16_e32 v3, v0, v2 -; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 -; VI-NEXT: buffer_store_short v3, off, s[8:11], 0 +; VI-NEXT: buffer_store_short v1, off, s[12:15], 0 +; VI-NEXT: buffer_store_short v3, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: madak_f16_use_2: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 +; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 ; GFX11-NEXT: s_mov_b32 s14, -1 ; GFX11-NEXT: s_mov_b32 s15, 0x31016000 ; GFX11-NEXT: s_mov_b32 s18, s14 diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll index 8bc8fbd0e0e84..9f6d27802e184 100644 --- a/llvm/test/CodeGen/AMDGPU/v_pack.ll +++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll @@ -7,7 +7,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GCN-LABEL: v_pack_b32_v2f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -24,7 +24,7 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace ; ; GISEL-LABEL: v_pack_b32_v2f16: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] 
glc dlc @@ -56,7 +56,7 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GCN-LABEL: v_pack_b32_v2f16_sub: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -73,7 +73,7 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs ; ; GISEL-LABEL: v_pack_b32_v2f16_sub: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -105,7 +105,7 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs define amdgpu_kernel void @fptrunc( ; GCN-LABEL: fptrunc: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s7, 0x31016000 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -125,7 +125,7 @@ define amdgpu_kernel void @fptrunc( ; ; GISEL-LABEL: fptrunc: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -147,7 +147,7 @@ define amdgpu_kernel void @fptrunc( define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GCN-LABEL: v_pack_b32.fabs: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -164,7 
+164,7 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace( ; ; GISEL-LABEL: v_pack_b32.fabs: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -198,7 +198,7 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace( define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GCN-LABEL: v_pack_b32.fneg: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -215,7 +215,7 @@ define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace( ; ; GISEL-LABEL: v_pack_b32.fneg: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll index 89fef7eead839..8579cbdf47137 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -89,7 +89,7 @@ define <2 x i16> @basic_smax_smin(i16 %src0, i16 %src1) { define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg %src0ext, i32 inreg %src1ext) { ; SDAG-VI-LABEL: basic_smax_smin_sgpr: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0xff ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_max_i16_e64 v1, s2, 0 @@ -104,7 +104,7 @@ define amdgpu_kernel void 
@basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; SDAG-GFX9-LABEL: basic_smax_smin_sgpr: ; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; SDAG-GFX9-NEXT: v_mov_b32_e32 v1, 0xff ; SDAG-GFX9-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -117,7 +117,7 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; SDAG-GFX11-LABEL: basic_smax_smin_sgpr: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; SDAG-GFX11-NEXT: v_mov_b32_e32 v2, 0 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_med3_i16 v0, s2, 0, 0xff @@ -132,7 +132,7 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; GISEL-VI-LABEL: basic_smax_smin_sgpr: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-VI-NEXT: s_sext_i32_i16 s4, 0 ; GISEL-VI-NEXT: s_sext_i32_i16 s5, 0xff ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -156,7 +156,7 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; GISEL-GFX9-LABEL: basic_smax_smin_sgpr: ; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GISEL-GFX9-NEXT: s_sext_i32_i16 s4, 0 ; GISEL-GFX9-NEXT: s_sext_i32_i16 s5, 0xff ; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -176,7 +176,7 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; GISEL-GFX11-LABEL: basic_smax_smin_sgpr: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, 0 ; GISEL-GFX11-NEXT: s_sext_i32_i16 s5, 0xff ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -413,13 +413,13 @@ define <2 x i16> @vec_smax_smin(<2 x 
i16> %src) { define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> inreg %src) { ; SDAG-VI-LABEL: vec_smax_smin_sgpr: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0xff ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: s_lshr_b32 s3, s2, 16 -; SDAG-VI-NEXT: v_max_i16_e64 v1, s2, 0 -; SDAG-VI-NEXT: v_max_i16_e64 v2, s3, 0 +; SDAG-VI-NEXT: s_lshr_b32 s2, s4, 16 +; SDAG-VI-NEXT: v_max_i16_e64 v1, s4, 0 +; SDAG-VI-NEXT: v_max_i16_e64 v2, s2, 0 ; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1 ; SDAG-VI-NEXT: v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; SDAG-VI-NEXT: v_or_b32_e32 v2, v1, v0 @@ -430,24 +430,24 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; ; SDAG-GFX9-LABEL: vec_smax_smin_sgpr: ; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; SDAG-GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; SDAG-GFX9-NEXT: s_movk_i32 s0, 0xff +; SDAG-GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; SDAG-GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX9-NEXT: s_movk_i32 s2, 0xff ; SDAG-GFX9-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX9-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX9-NEXT: v_pk_max_i16 v1, s4, 0 -; SDAG-GFX9-NEXT: v_pk_min_i16 v1, v1, s0 op_sel_hi:[1,0] -; SDAG-GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; SDAG-GFX9-NEXT: v_pk_min_i16 v1, v1, s2 op_sel_hi:[1,0] +; SDAG-GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX9-NEXT: s_endpgm ; ; SDAG-GFX11-LABEL: vec_smax_smin_sgpr: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; 
SDAG-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_pk_max_i16 v0, s2, 0 +; SDAG-GFX11-NEXT: v_pk_max_i16 v0, s4, 0 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] ; SDAG-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -457,24 +457,24 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; ; GISEL-VI-LABEL: vec_smax_smin_sgpr: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GISEL-VI-NEXT: s_sext_i32_i16 s3, 0 +; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_sext_i32_i16 s2, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: s_lshr_b32 s4, s2, 16 -; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2 +; GISEL-VI-NEXT: s_lshr_b32 s3, s4, 16 ; GISEL-VI-NEXT: s_sext_i32_i16 s4, s4 -; GISEL-VI-NEXT: s_max_i32 s2, s2, s3 -; GISEL-VI-NEXT: s_max_i32 s3, s4, s3 -; GISEL-VI-NEXT: s_sext_i32_i16 s4, 0xff ; GISEL-VI-NEXT: s_sext_i32_i16 s3, s3 +; GISEL-VI-NEXT: s_max_i32 s4, s4, s2 +; GISEL-VI-NEXT: s_max_i32 s2, s3, s2 +; GISEL-VI-NEXT: s_sext_i32_i16 s3, s4 +; GISEL-VI-NEXT: s_sext_i32_i16 s4, 0xff ; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2 -; GISEL-VI-NEXT: s_min_i32 s3, s3, s4 ; GISEL-VI-NEXT: s_min_i32 s2, s2, s4 -; GISEL-VI-NEXT: s_and_b32 s3, 0xffff, s3 +; GISEL-VI-NEXT: s_min_i32 s3, s3, s4 ; GISEL-VI-NEXT: s_and_b32 s2, 0xffff, s2 -; GISEL-VI-NEXT: s_lshl_b32 s3, s3, 16 -; GISEL-VI-NEXT: s_or_b32 s2, s2, s3 +; GISEL-VI-NEXT: s_and_b32 s3, 0xffff, s3 +; GISEL-VI-NEXT: s_lshl_b32 s2, s2, 16 +; GISEL-VI-NEXT: s_or_b32 s2, s3, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -483,40 +483,40 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; ; GISEL-GFX9-LABEL: vec_smax_smin_sgpr: ; GISEL-GFX9: 
; %bb.0: -; GISEL-GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GISEL-GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s0, 0 +; GISEL-GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s2, 0 ; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX9-NEXT: s_sext_i32_i16 s1, s4 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s4 ; GISEL-GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GISEL-GFX9-NEXT: s_max_i32 s0, s1, s0 -; GISEL-GFX9-NEXT: s_max_i32 s1, s4, 0 -; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s1, s0 -; GISEL-GFX9-NEXT: s_ashr_i32 s0, s0, 16 +; GISEL-GFX9-NEXT: s_max_i32 s2, s3, s2 +; GISEL-GFX9-NEXT: s_max_i32 s3, s4, 0 +; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s2 +; GISEL-GFX9-NEXT: s_ashr_i32 s2, s2, 16 ; GISEL-GFX9-NEXT: s_sext_i32_i16 s4, 0xff00ff -; GISEL-GFX9-NEXT: s_min_i32 s1, s1, s4 -; GISEL-GFX9-NEXT: s_min_i32 s0, s0, 0xff -; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s0 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GISEL-GFX9-NEXT: s_min_i32 s3, s3, s4 +; GISEL-GFX9-NEXT: s_min_i32 s2, s2, 0xff +; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s2 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX9-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: vec_smax_smin_sgpr: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, 0 +; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_sext_i32_i16 s2, 0 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, s2 -; GISEL-GFX11-NEXT: 
s_ashr_i32 s2, s2, 16 -; GISEL-GFX11-NEXT: s_max_i32 s3, s4, s3 -; GISEL-GFX11-NEXT: s_max_i32 s2, s2, 0 +; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, s4 +; GISEL-GFX11-NEXT: s_ashr_i32 s4, s4, 16 +; GISEL-GFX11-NEXT: s_max_i32 s2, s3, s2 +; GISEL-GFX11-NEXT: s_max_i32 s3, s4, 0 ; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s2, s3, s2 +; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s3 ; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, 0xff00ff ; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, s2 ; GISEL-GFX11-NEXT: s_ashr_i32 s2, s2, 16 diff --git a/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll b/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll index d5347f829002d..02a6024f858e9 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll @@ -25,7 +25,7 @@ bb: define amdgpu_kernel void @test_sub_co_sdwa(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 { ; GFX9-LABEL: test_sub_co_sdwa: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll index eb88c790dfe72..f0cbeba1cfb74 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll @@ -10,8 +10,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @extract_insert_same_dynelt_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idx) #1 { ; GCN-LABEL: extract_insert_same_dynelt_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s0, s[2:3], 0xd ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: 
s_mov_b32 s6, 0 @@ -34,30 +34,27 @@ define amdgpu_kernel void @extract_insert_same_dynelt_v4i32(ptr addrspace(1) %ou define amdgpu_kernel void @extract_insert_different_dynelt_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idx0, i32 %idx1) #1 { ; GCN-LABEL: extract_insert_different_dynelt_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[4:5], s[10:11] ; GCN-NEXT: v_mov_b32_e32 v5, 0 -; GCN-NEXT: buffer_load_dwordx4 v[1:4], v[4:5], s[4:7], 0 addr64 -; GCN-NEXT: s_load_dword s14, s[0:1], 0xf +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[0:1], s[10:11] +; GCN-NEXT: buffer_load_dwordx4 v[1:4], v[4:5], s[0:3], 0 addr64 ; GCN-NEXT: s_cmp_eq_u32 s13, 3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s13, 2 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s13, 1 +; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] ; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s13, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s14, 1 ; GCN-NEXT: v_mov_b32_e32 v7, v5 -; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc ; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[0:1] @@ -87,8 +84,8 @@ define amdgpu_kernel void @extract_insert_different_dynelt_v4i32(ptr addrspace(1 define amdgpu_kernel void @extract_insert_same_elt2_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idx) #1 { ; GCN-LABEL: extract_insert_same_elt2_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s0, s[2:3], 0xd ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 @@ -111,19 +108,19 @@ define amdgpu_kernel void @extract_insert_same_elt2_v4i32(ptr addrspace(1) %out, define amdgpu_kernel void @extract_insert_same_dynelt_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in, float %val, i32 %idx) #1 { ; GCN-LABEL: extract_insert_same_dynelt_v4f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s8, s[0:1], 0xd -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[0:1], s[6:7] +; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] ; GCN-NEXT: v_mov_b32_e32 v5, 0 -; GCN-NEXT: buffer_load_dwordx4 v[1:4], v[4:5], s[0:3], 0 addr64 glc +; GCN-NEXT: buffer_load_dwordx4 v[1:4], v[4:5], s[8:11], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b64 s[6:7], s[2:3] +; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] ; GCN-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: buffer_store_dword v0, v[4:5], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index 66c49ba8b734d..2797c5b798881 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -1585,45 +1585,47 @@ define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1) define amdgpu_kernel void @fma_shuffle_v2f16(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, 
ptr addrspace(1) nocapture %C) { ; GFX9-LABEL: fma_shuffle_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1] ; GFX9-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1] ; GFX9-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0] ; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: fma_shuffle_v2f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x2 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] ; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[4:5] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1] ; GFX10-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1] ; GFX10-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0] ; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] -; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] +; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: 
fma_shuffle_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x2 @@ -1713,7 +1715,7 @@ define <4 x half> @shuffle_v4f16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) % define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in, ptr addrspace(1) %out) { ; GFX9-LABEL: shuffle_scalar_load_v8i32_0123: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 @@ -1727,7 +1729,7 @@ define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in, ; ; GFX10-LABEL: shuffle_scalar_load_v8i32_0123: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 @@ -1741,7 +1743,7 @@ define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in, ; ; GFX11-LABEL: shuffle_scalar_load_v8i32_0123: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 @@ -4237,8 +4239,8 @@ define <6 x bfloat> @shuffle_v6bf16_452367(ptr addrspace(1) %arg0, ptr addrspace define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C) { ; GFX9-LABEL: fma_shuffle_v2bf16: 
; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_movk_i32 s2, 0x7fff ; GFX9-NEXT: s_mov_b32 s3, 0x7060302 @@ -4321,8 +4323,8 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX10-LABEL: fma_shuffle_v2bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x2 @@ -4404,12 +4406,14 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX11-LABEL: fma_shuffle_v2bf16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x10 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] +; GFX11-NEXT: global_load_b64 v[0:1], v6, s[0:1] ; GFX11-NEXT: global_load_b64 v[2:3], v6, s[4:5] ; GFX11-NEXT: global_load_b64 v[4:5], v6, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -4420,43 +4424,43 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_fmac_f32 v11, v12, v9 :: v_dual_and_b32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_fmac_f32_e32 v1, v12, v4 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v13, v11, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_fmac_f32 v1, v12, v4 :: v_dual_lshlrev_b32 v8, 16, v2 ; GFX11-NEXT: v_bfe_u32 v15, v1, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v9 ; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; GFX11-NEXT: v_add3_u32 v13, v13, v11, 0x7fff -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v15, v15, v1, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX11-NEXT: v_dual_fmac_f32 v7, v8, v9 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_fmac_f32_e32 v0, v8, v4 ; GFX11-NEXT: v_bfe_u32 v4, v7, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7 -; GFX11-NEXT: v_bfe_u32 v9, v0, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v4, v4, v7, 0x7fff -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v0 -; GFX11-NEXT: v_add3_u32 v9, v9, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: 
v_cndmask_b32_e32 v4, v4, v8, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_fmac_f32_e32 v4, v2, v5 +; GFX11-NEXT: v_fmac_f32_e32 v11, v12, v9 +; GFX11-NEXT: v_bfe_u32 v9, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v13, v11, 16, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add3_u32 v9, v9, v0, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; GFX11-NEXT: v_add3_u32 v13, v13, v11, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_dual_fmac_f32 v4, v2, v5 :: v_dual_cndmask_b32 v1, v15, v16 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v15, v16, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -4491,7 +4495,7 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc_lo ; GFX11-NEXT: v_perm_b32 v0, v3, v0, 0x7060302 -; GFX11-NEXT: global_store_b64 v6, v[0:1], s[2:3] +; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll 
b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll index 340f0cdd5d5d0..02da6deb96f1f 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -462,9 +462,9 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s ; SI-LABEL: name: livevariables_update_missed_block ; SI: bb.0.entry: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) - ; SI-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 + ; SI-NEXT: liveins: $vgpr0, $sgpr2_sgpr3 ; SI-NEXT: {{ $}} - ; SI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY killed $sgpr0_sgpr1 + ; SI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY killed $sgpr2_sgpr3 ; SI-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY killed $vgpr0 ; SI-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[COPY1]](s32), implicit $exec ; SI-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_NE_U32_e64_]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec @@ -474,7 +474,7 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s ; SI-NEXT: successors: %bb.7(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset, align 4, addrspace 4) - ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %48, 0, implicit $exec + ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %54, 0, implicit $exec ; SI-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], 
%subreg.sub0, killed [[V_ADDC_U32_e64_]], %subreg.sub1 ; SI-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.i10, addrspace 1) @@ -502,14 +502,14 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s ; SI-NEXT: bb.5.Flow: ; SI-NEXT: successors: %bb.1(0x40000000), %bb.7(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]](s32), %bb.0, undef %49:vgpr_32, %bb.6 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]](s32), %bb.0, undef %55:vgpr_32, %bb.6 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.1 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.6.sw.bb18: ; SI-NEXT: successors: %bb.5(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %35:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %41:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 ; SI-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec ; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B2]], killed [[PHI1]], 0, 0, implicit $exec :: (store (s8) into `ptr addrspace(1) null`, addrspace 1) ; SI-NEXT: S_BRANCH %bb.5 @@ -562,9 +562,9 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe ; SI-LABEL: name: nested_waterfalls ; SI: bb.0.entry: ; SI-NEXT: successors: %bb.1(0x80000000) - ; SI-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 + ; SI-NEXT: liveins: $vgpr0, $sgpr2_sgpr3 ; SI-NEXT: {{ $}} - ; SI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY killed $sgpr0_sgpr1 + ; SI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY killed $sgpr2_sgpr3 ; SI-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY killed $vgpr0 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.1.if.then: @@ -635,7 +635,7 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe ; SI-NEXT: bb.5: ; SI-NEXT: successors: 
%bb.4(0x40000000), %bb.6(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[IMAGE_SAMPLE_V1_V2_gfx10_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_gfx10 undef %22:vreg_64, [[REG_SEQUENCE5]], killed [[REG_SEQUENCE8]], 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) + ; SI-NEXT: [[IMAGE_SAMPLE_V1_V2_gfx10_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_gfx10 undef %28:vreg_64, [[REG_SEQUENCE5]], killed [[REG_SEQUENCE8]], 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) ; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc ; SI-NEXT: SI_WATERFALL_LOOP %bb.4, implicit $exec ; SI-NEXT: {{ $}} @@ -648,7 +648,7 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe ; SI-NEXT: {{ $}} ; SI-NEXT: bb.7: ; SI-NEXT: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_]] - ; SI-NEXT: GLOBAL_STORE_DWORD undef %25:vreg_64, killed [[IMAGE_SAMPLE_V1_V2_gfx10_]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) + ; SI-NEXT: GLOBAL_STORE_DWORD undef %31:vreg_64, killed [[IMAGE_SAMPLE_V1_V2_gfx10_]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) ; SI-NEXT: S_ENDPGM 0 entry: %0 = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index 1937c57382092..6410df7f69e2a 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -4,8 +4,8 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v3i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX906-NEXT: 
v_lshlrev_b32_e32 v2, 2, v0 ; GFX906-NEXT: v_mov_b32_e32 v3, 8 ; GFX906-NEXT: v_mov_b32_e32 v1, 0 @@ -18,7 +18,7 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: v_or_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX906-NEXT: v_and_or_b32 v4, v4, s4, v5 -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB0_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dword v0, v2, s[6:7] @@ -28,9 +28,9 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX906-NEXT: v_and_or_b32 v4, v0, s4, v2 ; GFX906-NEXT: .LBB0_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: global_store_byte_d16_hi v1, v4, s[2:3] offset:2 -; GFX906-NEXT: global_store_short v1, v4, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: global_store_byte_d16_hi v1, v4, s[0:1] offset:2 +; GFX906-NEXT: global_store_short v1, v4, s[0:1] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -52,21 +52,21 @@ bb.2: define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v4i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dword v2, v3, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB1_2 ; GFX906-NEXT: ; %bb.1: ; 
%bb.1 ; GFX906-NEXT: global_load_dword v2, v3, s[6:7] ; GFX906-NEXT: .LBB1_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dword v1, v2, s[2:3] +; GFX906-NEXT: global_store_dword v1, v2, s[0:1] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -88,8 +88,8 @@ bb.2: define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v5i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX906-NEXT: v_mov_b32_e32 v3, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 @@ -97,16 +97,16 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[4:5] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB2_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX906-NEXT: .LBB2_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: global_store_byte v3, v2, s[2:3] offset:4 -; GFX906-NEXT: global_store_dword v3, v1, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: global_store_byte v3, v2, s[0:1] offset:4 +; GFX906-NEXT: global_store_dword v3, v1, s[0:1] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -128,21 +128,21 @@ bb.2: define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture 
%dst) { ; GFX906-LABEL: v8i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX906-NEXT: v_mov_b32_e32 v3, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB3_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[6:7] ; GFX906-NEXT: .LBB3_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v3, v[1:2], s[2:3] +; GFX906-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -164,21 +164,21 @@ bb.2: define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v16i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; GFX906-NEXT: v_mov_b32_e32 v5, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx4 v[1:4], v6, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB4_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx4 v[1:4], v6, s[6:7] ; GFX906-NEXT: .LBB4_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: s_or_b64 
exec, exec, s[2:3] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx4 v5, v[1:4], s[2:3] +; GFX906-NEXT: global_store_dwordx4 v5, v[1:4], s[0:1] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -200,25 +200,25 @@ bb.2: define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v32i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v10, 5, v0 ; GFX906-NEXT: v_mov_b32_e32 v9, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[4:5] offset:16 ; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB5_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[6:7] offset:16 ; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[6:7] ; GFX906-NEXT: .LBB5_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[2:3] offset:16 +; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[0:1] offset:16 ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[2:3] +; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[0:1] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -240,25 +240,25 @@ bb.2: define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v256i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; 
GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v61, 3, v0 -; GFX906-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX906-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX906-NEXT: s_mov_b32 s10, -1 +; GFX906-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX906-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[4:5] offset:240 -; GFX906-NEXT: s_mov_b32 s11, 0xe00000 -; GFX906-NEXT: s_add_u32 s8, s8, s3 -; GFX906-NEXT: s_addc_u32 s9, s9, 0 +; GFX906-NEXT: s_mov_b32 s14, -1 +; GFX906-NEXT: s_mov_b32 s15, 0xe00000 +; GFX906-NEXT: s_add_u32 s12, s12, s9 +; GFX906-NEXT: s_addc_u32 s13, s13, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX906-NEXT: v_mov_b32_e32 v4, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v5, off, s[8:11], 0 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v5, off, s[12:15], 0 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill ; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[4:5] offset:224 ; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_load_dwordx4 v[9:12], v61, s[4:5] offset:208 @@ -280,11 +280,11 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[6:7] offset:240 ; GFX906-NEXT: 
s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill ; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[6:7] offset:224 ; GFX906-NEXT: global_load_dwordx4 v[9:12], v61, s[6:7] offset:208 ; GFX906-NEXT: global_load_dwordx4 v[13:16], v61, s[6:7] offset:192 @@ -302,7 +302,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[6:7] ; GFX906-NEXT: .LBB6_2: ; %bb.2 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; GFX906-NEXT: s_waitcnt vmcnt(7) ; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[0:1] offset:112 ; GFX906-NEXT: s_waitcnt vmcnt(7) ; GFX906-NEXT: global_store_dwordx4 v4, v[37:40], s[0:1] offset:96 @@ -318,11 +318,11 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: global_store_dwordx4 v4, v[57:60], s[0:1] offset:16 ; GFX906-NEXT: s_waitcnt vmcnt(7) ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 ; 
4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:240 ; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[0:1] offset:224 @@ -353,9 +353,9 @@ bb.2: define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: repeat_successor: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dword s8, s[0:1], 0x24 -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX906-NEXT: s_load_dword s8, s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_cmp_lt_i32 s8, 3 ; GFX906-NEXT: s_cbranch_scc0 .LBB7_3 @@ -375,7 +375,7 @@ define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr ; GFX906-NEXT: .LBB7_5: ; %return.sink.split ; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dword v1, v0, s[2:3] +; GFX906-NEXT: global_store_dword v1, v0, s[0:1] ; GFX906-NEXT: .LBB7_6: ; %return ; GFX906-NEXT: s_endpgm entry: @@ -405,7 +405,7 @@ return: define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_phi_chain: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; 
GFX906-NEXT: s_waitcnt lgkmcnt(0) @@ -460,7 +460,7 @@ bb.3: define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_phi_zeroinit: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: ; implicit-def: $vgpr1_vgpr2 @@ -522,7 +522,7 @@ bb.3: define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_phi_const: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: ; implicit-def: $vgpr3 @@ -631,7 +631,7 @@ bb.3: define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_multi_block: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX906-NEXT: v_mov_b32_e32 v5, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 @@ -682,25 +682,25 @@ bb.3: define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v32i8_loop_carried: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; GFX906-NEXT: v_cmp_lt_u32_e32 vcc, 14, v0 ; GFX906-NEXT: s_mov_b32 s4, 0x2000604 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dword v1, 
v1, s[2:3] -; GFX906-NEXT: s_mov_b64 s[2:3], 0 +; GFX906-NEXT: global_load_dword v1, v1, s[0:1] +; GFX906-NEXT: s_mov_b64 s[0:1], 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v0, v1 ; GFX906-NEXT: .LBB12_1: ; %bb.1 ; GFX906-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX906-NEXT: s_and_b64 s[6:7], exec, vcc -; GFX906-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] +; GFX906-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] ; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX906-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX906-NEXT: s_cbranch_execnz .LBB12_1 ; GFX906-NEXT: ; %bb.2: ; %bb.2.loopexit -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_store_dword v1, v0, s[0:1] @@ -728,9 +728,9 @@ bb.2: define amdgpu_kernel void @v8i8_multiuse_multiblock(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst1, ptr addrspace(1) nocapture %dst2, ptr addrspace(1) nocapture %dst3) { ; GFX906-LABEL: v8i8_multiuse_multiblock: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; GFX906-NEXT: v_cmp_lt_u32_e64 s[2:3], 14, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) @@ -874,13 +874,13 @@ bb.3: define amdgpu_kernel void @MissingInc_PhiChain(i1 %cmp, <16 x i8> %input) { ; GFX906-LABEL: MissingInc_PhiChain: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dword s2, s[0:1], 0x24 -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX906-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 
; GFX906-NEXT: s_mov_b32 s10, 1 ; GFX906-NEXT: v_mov_b32_e32 v4, 1 ; GFX906-NEXT: s_mov_b32 s11, 1 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_bitcmp1_b32 s2, 0 +; GFX906-NEXT: s_bitcmp1_b32 s0, 0 ; GFX906-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX906-NEXT: s_xor_b64 s[0:1], s[2:3], -1 ; GFX906-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll b/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll index 1afe5cdea8723..d7db68a433319 100644 --- a/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll +++ b/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll @@ -18,29 +18,29 @@ declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 i define amdgpu_kernel void @foo(i1 %cmp1) { ; GFX906-LABEL: foo: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX906-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX906-NEXT: s_mov_b32 s10, -1 -; GFX906-NEXT: s_mov_b32 s11, 0xe00000 -; GFX906-NEXT: s_add_u32 s8, s8, s3 -; GFX906-NEXT: s_addc_u32 s9, s9, 0 -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 -; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:4 -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:8 -; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:12 -; GFX906-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x1c +; GFX906-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX906-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX906-NEXT: s_mov_b32 s14, -1 +; GFX906-NEXT: s_mov_b32 s15, 0xe00000 +; GFX906-NEXT: s_add_u32 s12, s12, s9 +; GFX906-NEXT: s_addc_u32 s13, s13, 0 +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 +; GFX906-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:4 +; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:8 +; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:12 +; GFX906-NEXT: s_load_dword s4, s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x1c +; 
GFX906-NEXT: s_mov_b64 s[2:3], exec ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_bitcmp1_b32 s4, 0 -; GFX906-NEXT: s_mul_i32 s0, s2, s3 -; GFX906-NEXT: v_mul_u32_u24_e32 v1, s3, v1 +; GFX906-NEXT: s_mul_i32 s0, s0, s1 +; GFX906-NEXT: v_mul_u32_u24_e32 v1, s1, v1 ; GFX906-NEXT: v_mad_u32_u24 v0, s0, v0, v1 ; GFX906-NEXT: v_add_lshl_u32 v2, v0, v2, 4 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_mov_b32 s4, 0 ; GFX906-NEXT: v_mov_b32_e32 v1, v0 ; GFX906-NEXT: s_cselect_b32 s5, 1, 0 -; GFX906-NEXT: s_mov_b64 s[2:3], exec ; GFX906-NEXT: ds_write_b64 v2, v[0:1] ; GFX906-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; GFX906-NEXT: s_waitcnt vmcnt(3) diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 901e88a4c6aca..dae46361b9bcc 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vopc_i32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[0:1] @@ -20,7 +20,7 @@ define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vopc_i32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[0:1] @@ -41,7 +41,7 @@ define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) { define amdgpu_kernel void @test_vopc_f32(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vopc_f32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[0:1] @@ -53,7 +53,7 @@ define amdgpu_kernel void @test_vopc_f32(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vopc_f32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[0:1] @@ -101,7 +101,7 @@ define amdgpu_ps void @test_vopc_vcmp(float %x) { define amdgpu_kernel void @test_vopc_2xf16(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vopc_2xf16: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -114,7 +114,7 @@ define amdgpu_kernel void @test_vopc_2xf16(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vopc_2xf16: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -138,25 +138,25 @@ define amdgpu_kernel void @test_vopc_class(ptr addrspace(1) %out, float %x) #0 { ; GFX1032-LABEL: test_vopc_class: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_class_f32_e64 s0, s4, 0x204 -; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX1032-NEXT: global_store_dword v0, v1, s[2:3] +; GFX1032-NEXT: v_cmp_class_f32_e64 s2, s4, 0x204 +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX1032-NEXT: global_store_dword v0, v1, s[0:1] ; 
GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_vopc_class: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_class_f32_e64 s[0:1], s4, 0x204 -; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX1064-NEXT: global_store_dword v0, v1, s[2:3] +; GFX1064-NEXT: v_cmp_class_f32_e64 s[2:3], s4, 0x204 +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; GFX1064-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1064-NEXT: s_endpgm %fabs = tail call float @llvm.fabs.f32(float %x) %cmp = fcmp oeq float %fabs, 0x7FF0000000000000 @@ -169,27 +169,27 @@ define amdgpu_kernel void @test_vcmp_vcnd_f16(ptr addrspace(1) %out, half %x) #0 ; GFX1032-LABEL: test_vcmp_vcnd_f16: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032-NEXT: v_cmp_neq_f16_e64 vcc_lo, 0x7c00, s4 ; GFX1032-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v0, vcc_lo -; GFX1032-NEXT: global_store_short v1, v0, s[2:3] +; GFX1032-NEXT: global_store_short v1, v0, s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_vcmp_vcnd_f16: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064-NEXT: v_cmp_neq_f16_e64 vcc, 0x7c00, 
s4 ; GFX1064-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v0, vcc -; GFX1064-NEXT: global_store_short v1, v0, s[2:3] +; GFX1064-NEXT: global_store_short v1, v0, s[0:1] ; GFX1064-NEXT: s_endpgm %cmp = fcmp oeq half %x, 0x7FF0000000000000 %sel = select i1 %cmp, half 1.0, half %x @@ -200,7 +200,7 @@ define amdgpu_kernel void @test_vcmp_vcnd_f16(ptr addrspace(1) %out, half %x) #0 define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vop3_cmp_f32_sop_and: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] @@ -214,7 +214,7 @@ define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vop3_cmp_f32_sop_and: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] @@ -239,7 +239,7 @@ define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) { define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vop3_cmp_i32_sop_xor: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] @@ -253,7 +253,7 @@ define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vop3_cmp_i32_sop_xor: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: 
global_load_dword v1, v0, s[2:3] @@ -278,7 +278,7 @@ define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) { define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vop3_cmp_u32_sop_or: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] @@ -292,7 +292,7 @@ define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vop3_cmp_u32_sop_or: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] @@ -318,10 +318,10 @@ define amdgpu_kernel void @test_mask_if(ptr addrspace(1) %arg) #0 { ; GFX1032-LABEL: test_mask_if: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_cmp_lt_u32_e32 vcc_lo, 10, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_2 ; GFX1032-NEXT: ; %bb.1: ; %if -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_store_dword v0, v0, s[0:1] @@ -331,10 +331,10 @@ define amdgpu_kernel void @test_mask_if(ptr addrspace(1) %arg) #0 { ; GFX1064-LABEL: test_mask_if: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: v_cmp_lt_u32_e32 vcc, 10, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_2 ; GFX1064-NEXT: ; %bb.1: ; %if -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_store_dword v0, v0, s[0:1] @@ -355,7 +355,7 @@ endif: define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 { ; GFX1032-LABEL: test_loop_with_if: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -417,7 +417,7 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 { ; ; GFX1064-LABEL: test_loop_with_if: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -516,42 +516,42 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) # ; GFX1032-LABEL: test_loop_with_if_else_break: ; GFX1032: ; %bb.0: ; %bb ; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB11_6 ; GFX1032-NEXT: ; %bb.1: ; %.preheader -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_min_u32_e32 v1, 0x100, v0 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_mov_b32 s3, 0 -; GFX1032-NEXT: ; implicit-def: $sgpr4 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: ; implicit-def: $sgpr3 ; GFX1032-NEXT: s_branch .LBB11_4 ; GFX1032-NEXT: .LBB11_2: ; %bb8 ; GFX1032-NEXT: ; in Loop: Header=BB11_4 Depth=1 -; GFX1032-NEXT: s_add_i32 s3, s3, 1 +; GFX1032-NEXT: s_add_i32 s2, s2, 1 ; GFX1032-NEXT: global_store_dword v2, v0, s[0:1] -; GFX1032-NEXT: v_cmp_ge_u32_e32 vcc_lo, s3, v1 +; GFX1032-NEXT: v_cmp_ge_u32_e32 vcc_lo, s2, v1 ; GFX1032-NEXT: s_add_u32 s0, s0, 
4 ; GFX1032-NEXT: s_addc_u32 s1, s1, 0 -; GFX1032-NEXT: s_andn2_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_andn2_b32 s3, s3, exec_lo ; GFX1032-NEXT: s_and_b32 s5, vcc_lo, exec_lo -; GFX1032-NEXT: s_or_b32 s4, s4, s5 +; GFX1032-NEXT: s_or_b32 s3, s3, s5 ; GFX1032-NEXT: .LBB11_3: ; %Flow ; GFX1032-NEXT: ; in Loop: Header=BB11_4 Depth=1 -; GFX1032-NEXT: s_and_b32 s5, exec_lo, s4 -; GFX1032-NEXT: s_or_b32 s2, s5, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, s3 +; GFX1032-NEXT: s_or_b32 s4, s5, s4 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_execz .LBB11_6 ; GFX1032-NEXT: .LBB11_4: ; %bb2 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v3, v2, s[0:1] -; GFX1032-NEXT: s_or_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_or_b32 s3, s3, exec_lo ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_gt_i32_e32 vcc_lo, 11, v3 ; GFX1032-NEXT: s_cbranch_vccz .LBB11_2 ; GFX1032-NEXT: ; %bb.5: ; in Loop: Header=BB11_4 Depth=1 -; GFX1032-NEXT: ; implicit-def: $sgpr3 +; GFX1032-NEXT: ; implicit-def: $sgpr2 ; GFX1032-NEXT: ; implicit-def: $sgpr0_sgpr1 ; GFX1032-NEXT: s_branch .LBB11_3 ; GFX1032-NEXT: .LBB11_6: ; %.loopexit @@ -561,10 +561,10 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) # ; GFX1064: ; %bb.0: ; %bb ; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_mov_b32 s6, 0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB11_6 ; GFX1064-NEXT: ; %bb.1: ; %.preheader -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_min_u32_e32 v1, 0x100, v0 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 @@ -631,7 +631,7 @@ bb8: define amdgpu_kernel void @test_addc_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 { ; GFX1032-LABEL: 
test_addc_vop2b: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -643,7 +643,7 @@ define amdgpu_kernel void @test_addc_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 ; ; GFX1064-LABEL: test_addc_vop2b: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -664,7 +664,7 @@ bb: define amdgpu_kernel void @test_subbrev_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 { ; GFX1032-LABEL: test_subbrev_vop2b: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -676,7 +676,7 @@ define amdgpu_kernel void @test_subbrev_vop2b(ptr addrspace(1) %arg, i64 %arg1) ; ; GFX1064-LABEL: test_subbrev_vop2b: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -697,7 +697,7 @@ bb: define amdgpu_kernel void @test_subb_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 { ; GFX1032-LABEL: test_subb_vop2b: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -709,7 +709,7 @@ define amdgpu_kernel void @test_subb_vop2b(ptr addrspace(1) 
%arg, i64 %arg1) #0 ; ; GFX1064-LABEL: test_subb_vop2b: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -730,7 +730,7 @@ bb: define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-LABEL: test_udiv64: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -892,7 +892,7 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; ; GFX1064-LABEL: test_udiv64: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1063,7 +1063,7 @@ bb: define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX1032-LABEL: test_div_scale_f32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -1077,7 +1077,7 @@ define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, ptr addrspa ; ; GFX1064-LABEL: test_div_scale_f32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -1104,31 +1104,33 @@ define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, 
ptr addrspa define amdgpu_kernel void @test_div_scale_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) #0 { ; GFX1032-LABEL: test_div_scale_f64: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc +; GFX1032-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc dlc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc dlc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1] +; GFX1032-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[2:3], v[0:1] +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_div_scale_f64: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc +; GFX1064-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc dlc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc dlc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1] +; GFX1064-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[2:3], v[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1064-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -1186,8 +1188,8 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, float %a, fl ; GFX1032-LABEL: test_div_fmas_f32: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v0, s5 @@ -1195,14 +1197,14 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, float %a, fl ; GFX1032-NEXT: s_bitcmp1_b32 s7, 0 ; GFX1032-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX1032-NEXT: v_div_fmas_f32 v0, s4, v0, v1 -; GFX1032-NEXT: global_store_dword v2, v0, s[2:3] +; GFX1032-NEXT: global_store_dword v2, v0, s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_div_fmas_f32: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v0, s5 @@ -1210,7 +1212,7 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, float %a, fl ; GFX1064-NEXT: s_bitcmp1_b32 s7, 0 ; GFX1064-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX1064-NEXT: v_div_fmas_f32 v0, s4, v0, v1 -; GFX1064-NEXT: global_store_dword v2, v0, s[2:3] +; GFX1064-NEXT: global_store_dword v2, v0, s[0:1] ; GFX1064-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone store float %result, ptr addrspace(1) %out, align 4 @@ -1221,14 +1223,14 @@ define 
amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; GFX1032-LABEL: test_div_fmas_f64: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x44 +; GFX1032-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x44 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v0, s8 ; GFX1032-NEXT: v_mov_b32_e32 v1, s9 ; GFX1032-NEXT: v_mov_b32_e32 v2, s10 ; GFX1032-NEXT: v_mov_b32_e32 v3, s11 -; GFX1032-NEXT: s_bitcmp1_b32 s2, 0 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 0 ; GFX1032-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX1032-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 @@ -1238,14 +1240,14 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; GFX1064-LABEL: test_div_fmas_f64: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x44 +; GFX1064-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x44 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v0, s8 ; GFX1064-NEXT: v_mov_b32_e32 v1, s9 ; GFX1064-NEXT: v_mov_b32_e32 v2, s10 ; GFX1064-NEXT: v_mov_b32_e32 v3, s11 -; GFX1064-NEXT: s_bitcmp1_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 0 ; GFX1064-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX1064-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 @@ -1261,11 +1263,9 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %dummy) #0 { ; GFX1032-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 
0x34 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 ; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 ; GFX1032-NEXT: s_mov_b32 vcc_lo, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1290,10 +1290,9 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, p ; GFX1064-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1064-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0 ; GFX1064-NEXT: s_mov_b64 vcc, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1344,7 +1343,7 @@ exit: define amdgpu_kernel void @fdiv_f32(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX1032-LABEL: fdiv_f32: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 ; GFX1032-NEXT: v_rcp_f32_e32 v1, v0 @@ -1363,7 +1362,7 @@ define amdgpu_kernel void @fdiv_f32(ptr addrspace(1) %out, float %a, float %b) # ; ; GFX1064-LABEL: fdiv_f32: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_div_scale_f32 v0, s[4:5], s3, s3, s2 ; GFX1064-NEXT: v_rcp_f32_e32 v1, v0 @@ -1389,13 +1388,13 @@ define amdgpu_kernel void @test_br_cc_f16( ; GFX1032-LABEL: test_br_cc_f16: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1032-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX1032-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX1032-NEXT: global_load_ushort v2, v0, s[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v2 ; GFX1032-NEXT: s_cbranch_vccnz .LBB24_2 @@ -1409,13 +1408,13 @@ define amdgpu_kernel void @test_br_cc_f16( ; GFX1064-LABEL: test_br_cc_f16: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX1064-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX1064-NEXT: global_load_ushort v2, v0, s[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v2 ; GFX1064-NEXT: s_cbranch_vccnz .LBB24_2 @@ -1446,12 +1445,12 @@ two: define amdgpu_kernel void @test_brcc_i1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %val) #0 { ; GCN-LABEL: test_brcc_i1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN-NEXT: s_load_dword s0, s[2:3], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitcmp0_b32 s2, 0 +; GCN-NEXT: s_bitcmp0_b32 s0, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB25_2 ; GCN-NEXT: ; %bb.1: ; %store -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0xde ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1473,14 +1472,14 @@ define amdgpu_kernel void @test_preserve_condition_undef_flag(float %arg, i32 %a ; GFX1032-LABEL: 
test_preserve_condition_undef_flag: ; GFX1032: ; %bb.0: ; %bb0 ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x24 +; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX1032-NEXT: s_load_dword s1, s[2:3], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_nlt_f32_e64 s0, s2, 1.0 -; GFX1032-NEXT: v_cmp_nlt_f32_e64 s1, s3, 1.0 -; GFX1032-NEXT: v_cmp_ngt_f32_e64 s2, s2, 0 -; GFX1032-NEXT: s_or_b32 s0, s0, s1 -; GFX1032-NEXT: s_or_b32 s0, s0, s2 +; GFX1032-NEXT: v_cmp_nlt_f32_e64 s2, s0, 1.0 +; GFX1032-NEXT: v_cmp_nlt_f32_e64 s1, s1, 1.0 +; GFX1032-NEXT: v_cmp_ngt_f32_e64 s0, s0, 0 +; GFX1032-NEXT: s_or_b32 s1, s2, s1 +; GFX1032-NEXT: s_or_b32 s0, s1, s0 ; GFX1032-NEXT: s_and_b32 vcc_lo, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_vccnz .LBB26_2 ; GFX1032-NEXT: ; %bb.1: ; %bb1 @@ -1493,11 +1492,11 @@ define amdgpu_kernel void @test_preserve_condition_undef_flag(float %arg, i32 %a ; GFX1064-LABEL: test_preserve_condition_undef_flag: ; GFX1064: ; %bb.0: ; %bb0 ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064-NEXT: s_load_dword s5, s[2:3], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cmp_nlt_f32_e64 s[0:1], s4, 1.0 -; GFX1064-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, 1.0 +; GFX1064-NEXT: v_cmp_nlt_f32_e64 s[2:3], s5, 1.0 ; GFX1064-NEXT: v_cmp_ngt_f32_e64 s[4:5], s4, 0 ; GFX1064-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] @@ -1531,7 +1530,7 @@ bb2: define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { ; GFX1032-LABEL: test_invert_true_phi_cond_break_loop: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX1032-NEXT: ; implicit-def: $sgpr1 ; GFX1032-NEXT: ; implicit-def: $sgpr2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ 
-1569,7 +1568,7 @@ define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { ; ; GFX1064-LABEL: test_invert_true_phi_cond_break_loop: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x24 ; GFX1064-NEXT: ; implicit-def: $sgpr2_sgpr3 ; GFX1064-NEXT: ; implicit-def: $sgpr4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1634,7 +1633,7 @@ define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(ptr addrspace(1) ; GFX1032-LABEL: test_movrels_extract_neg_offset_vgpr: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: v_add_nc_u32_e32 v0, 0xfffffe00, v0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo @@ -1649,7 +1648,7 @@ define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(ptr addrspace(1) ; GFX1064-LABEL: test_movrels_extract_neg_offset_vgpr: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: v_add_nc_u32_e32 v0, 0xfffffe00, v0 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1672,29 +1671,29 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0 ; GFX1032-LABEL: test_set_inactive: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: v_mov_b32_e32 v0, 42 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: 
global_store_dword v1, v0, s[2:3] +; GFX1032-NEXT: global_store_dword v1, v0, s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_set_inactive: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: v_mov_b32_e32 v0, 42 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: global_store_dword v1, v0, s[2:3] +; GFX1064-NEXT: global_store_dword v1, v0, s[0:1] ; GFX1064-NEXT: s_endpgm %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) store i32 %tmp, ptr addrspace(1) %out @@ -1704,7 +1703,7 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0 define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) #0 { ; GFX1032-LABEL: test_set_inactive_64: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v0, s2 @@ -1718,7 +1717,7 @@ define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) ; ; GFX1064-LABEL: test_set_inactive_64: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v0, s2 @@ -2138,7 +2137,7 @@ main_body: define amdgpu_kernel void @test_intr_fcmp_i64(ptr addrspace(1) %out, float %src, float %a) { ; GFX1032-LABEL: test_intr_fcmp_i64: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1032-NEXT: 
v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| @@ -2148,7 +2147,7 @@ define amdgpu_kernel void @test_intr_fcmp_i64(ptr addrspace(1) %out, float %src, ; ; GFX1064-LABEL: test_intr_fcmp_i64: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3| @@ -2166,26 +2165,26 @@ define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) { ; GFX1032-LABEL: test_intr_icmp_i64: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4 -; GFX1032-NEXT: v_mov_b32_e32 v0, s0 -; GFX1032-NEXT: global_store_dwordx2 v1, v[0:1], s[2:3] +; GFX1032-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 +; GFX1032-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_intr_icmp_i64: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0x64, s4 -; GFX1064-NEXT: v_mov_b32_e32 v0, s0 -; GFX1064-NEXT: v_mov_b32_e32 v1, s1 -; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX1064-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s4 +; GFX1064-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064-NEXT: v_mov_b32_e32 v1, s3 +; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; 
GFX1064-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %src, i32 100, i32 32) store i64 %result, ptr addrspace(1) %out @@ -2195,7 +2194,7 @@ define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) { define amdgpu_kernel void @test_intr_fcmp_i32(ptr addrspace(1) %out, float %src, float %a) { ; GFX1032-LABEL: test_intr_fcmp_i32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| @@ -2205,7 +2204,7 @@ define amdgpu_kernel void @test_intr_fcmp_i32(ptr addrspace(1) %out, float %src, ; ; GFX1064-LABEL: test_intr_fcmp_i32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3| @@ -2222,25 +2221,25 @@ define amdgpu_kernel void @test_intr_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; GFX1032-LABEL: test_intr_icmp_i32: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4 -; GFX1032-NEXT: v_mov_b32_e32 v1, s0 -; GFX1032-NEXT: global_store_dword v0, v1, s[2:3] +; GFX1032-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_intr_icmp_i32: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[2:3], 
0x2c +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0x64, s4 -; GFX1064-NEXT: v_mov_b32_e32 v1, s0 -; GFX1064-NEXT: global_store_dword v0, v1, s[2:3] +; GFX1064-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s4 +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1064-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32.i32(i32 %src, i32 100, i32 32) store i32 %result, ptr addrspace(1) %out @@ -2354,7 +2353,7 @@ define amdgpu_ps float @test_ps_live() #0 { define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX1032-LABEL: test_vccnz_ifcvt_triangle64: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -2374,7 +2373,7 @@ define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(ptr addrspace(1) %out, pt ; ; GFX1064-LABEL: test_vccnz_ifcvt_triangle64: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -2471,7 +2470,7 @@ main_body: define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; GFX1032-LABEL: icmp64: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x28 +; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x28 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1032-NEXT: s_sub_i32 s1, 0, s0 @@ -2505,7 +2504,7 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; ; GFX1064-LABEL: icmp64: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x28 +; GFX1064-NEXT: s_load_dword s0, 
s[2:3], 0x28 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1064-NEXT: s_sub_i32 s1, 0, s0 @@ -2566,7 +2565,7 @@ if.end2: ; preds = %if.end define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1032-LABEL: fcmp64: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x28 +; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x28 ; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_div_scale_f32 v1, s1, s0, s0, v0 @@ -2598,7 +2597,7 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; ; GFX1064-LABEL: fcmp64: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x28 +; GFX1064-NEXT: s_load_dword s2, s[2:3], 0x28 ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], s2, s2, v0 @@ -2658,7 +2657,7 @@ if.end2: ; preds = %if.end define amdgpu_kernel void @icmp32(i32 %n, i32 %s) { ; GFX1032-LABEL: icmp32: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x28 +; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x28 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1032-NEXT: s_sub_i32 s1, 0, s0 @@ -2692,7 +2691,7 @@ define amdgpu_kernel void @icmp32(i32 %n, i32 %s) { ; ; GFX1064-LABEL: icmp32: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x28 +; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x28 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1064-NEXT: s_sub_i32 s1, 0, s0 @@ -2752,7 +2751,7 @@ if.end2: ; preds = %if.end define amdgpu_kernel void @fcmp32(float %n, float %s) { ; GFX1032-LABEL: fcmp32: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x28 +; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x28 ; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_div_scale_f32 v1, s1, s0, s0, v0 @@ -2784,7 +2783,7 @@ define amdgpu_kernel 
void @fcmp32(float %n, float %s) { ; ; GFX1064-LABEL: fcmp32: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x28 +; GFX1064-NEXT: s_load_dword s2, s[2:3], 0x28 ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], s2, s2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll index e0b320aa4f372..978ac548443f7 100644 --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i16_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -22,7 +22,7 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) { ; ; VI-LABEL: widen_i16_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -36,7 +36,7 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) { ; ; GFX11-LABEL: widen_i16_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -59,7 +59,7 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) { define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i16_constant_load_zext_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: 
s_waitcnt lgkmcnt(0) @@ -76,7 +76,7 @@ define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %ar ; ; VI-LABEL: widen_i16_constant_load_zext_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -91,7 +91,7 @@ define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %ar ; ; GFX11-LABEL: widen_i16_constant_load_zext_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -117,7 +117,7 @@ define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %ar define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i16_constant_load_sext_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -134,7 +134,7 @@ define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %ar ; ; VI-LABEL: widen_i16_constant_load_sext_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -149,7 +149,7 @@ define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %ar ; ; GFX11-LABEL: widen_i16_constant_load_sext_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -175,7 +175,7 @@ define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr 
addrspace(4) %ar define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i17_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -199,7 +199,7 @@ define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) { ; ; VI-LABEL: widen_i17_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, 2 @@ -218,7 +218,7 @@ define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) { ; ; GFX11-LABEL: widen_i17_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -247,7 +247,7 @@ define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) { define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_f16_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -263,7 +263,7 @@ define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) { ; ; VI-LABEL: widen_f16_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -275,7 +275,7 @@ define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) { ; ; GFX11-LABEL: widen_f16_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -296,7 +296,7 @@ define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) { define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_v2i8_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -317,7 +317,7 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) { ; ; VI-LABEL: widen_v2i8_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 44 ; VI-NEXT: v_mov_b32_e32 v1, 3 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -338,7 +338,7 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) { ; ; GFX11-LABEL: widen_v2i8_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -368,7 +368,7 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) { define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4) %arg) { ; SI-LABEL: no_widen_i16_constant_divergent_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -387,7 +387,7 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4) ; ; VI-LABEL: no_widen_i16_constant_divergent_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 
; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -404,7 +404,9 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4) ; ; GFX11-LABEL: no_widen_i16_constant_divergent_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v0, v0, s[0:1] @@ -431,7 +433,7 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4) define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i1_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -446,7 +448,7 @@ define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) { ; ; VI-LABEL: widen_i1_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -459,7 +461,7 @@ define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) { ; ; GFX11-LABEL: widen_i1_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -480,7 +482,7 @@ define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) { define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i16_zextload_i64_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; 
SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -497,7 +499,7 @@ define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4) ; ; VI-LABEL: widen_i16_zextload_i64_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -512,7 +514,7 @@ define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4) ; ; GFX11-LABEL: widen_i16_zextload_i64_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -538,7 +540,7 @@ define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4) define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i1_zext_to_i64_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -556,7 +558,7 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) % ; ; VI-LABEL: widen_i1_zext_to_i64_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -572,7 +574,7 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) % ; ; GFX11-LABEL: widen_i1_zext_to_i64_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ 
-596,7 +598,7 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) % define amdgpu_kernel void @widen_i16_constant32_load(ptr addrspace(6) %arg) { ; SI-LABEL: widen_i16_constant32_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s1, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -612,7 +614,7 @@ define amdgpu_kernel void @widen_i16_constant32_load(ptr addrspace(6) %arg) { ; ; VI-LABEL: widen_i16_constant32_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 @@ -627,7 +629,7 @@ define amdgpu_kernel void @widen_i16_constant32_load(ptr addrspace(6) %arg) { ; ; GFX11-LABEL: widen_i16_constant32_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -651,7 +653,7 @@ define amdgpu_kernel void @widen_i16_constant32_load(ptr addrspace(6) %arg) { define amdgpu_kernel void @widen_i16_global_invariant_load(ptr addrspace(1) %arg) { ; SI-LABEL: widen_i16_global_invariant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -667,7 +669,7 @@ define amdgpu_kernel void @widen_i16_global_invariant_load(ptr addrspace(1) %arg ; ; VI-LABEL: widen_i16_global_invariant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -681,7 +683,7 @@ define amdgpu_kernel void @widen_i16_global_invariant_load(ptr addrspace(1) %arg ; ; GFX11-LABEL: 
widen_i16_global_invariant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll index 40e4692a18ec7..d31c9e7e03e79 100644 --- a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) { ; ; GFX9-LABEL: workgroup_id_x: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -17,7 +17,7 @@ define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) { ; ; GFX12-LABEL: workgroup_id_x: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] @@ -33,24 +33,26 @@ define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) { define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace(1) %ptry) { ; GFX9-LABEL: workgroup_id_xy: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, ttmp7 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_and_b32 s4, ttmp7, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] -; GFX9-NEXT: global_store_dword v2, v1, s[2:3] +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: 
global_store_dword v1, v2, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: workgroup_id_xy: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, ttmp7 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: s_and_b32 s4, ttmp7, 0xffff +; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX12-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: global_store_b32 v1, v2, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -65,14 +67,14 @@ define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspace(1) %ptry, ptr addrspace(1) %ptrz) { ; GFX9-LABEL: workgroup_id_xyz: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_and_b32 s6, ttmp7, 0xffff +; GFX9-NEXT: s_and_b32 s8, ttmp7, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: s_lshr_b32 s0, ttmp7, 16 ; GFX9-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -82,8 +84,8 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac ; GFX12-LABEL: workgroup_id_xyz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 
+; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 ; GFX12-NEXT: s_and_b32 s2, ttmp7, 0xffff ; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 ; GFX12-NEXT: s_lshr_b32 s3, ttmp7, 16 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index def51f2b16d3e..a74dbe1de0d39 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -438,33 +438,49 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; ; GFX9-O3-LABEL: call: ; GFX9-O3: ; %bb.0: -; GFX9-O3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-O3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-O3-NEXT: s_mov_b32 s14, -1 -; GFX9-O3-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-O3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-O3-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-O3-NEXT: s_add_u32 s12, s12, s3 -; GFX9-O3-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-O3-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 +; GFX9-O3-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 +; GFX9-O3-NEXT: s_mov_b32 s26, -1 +; GFX9-O3-NEXT: s_mov_b32 s27, 0xe00000 +; GFX9-O3-NEXT: s_add_u32 s24, s24, s9 ; GFX9-O3-NEXT: s_mov_b32 s32, 0 -; GFX9-O3-NEXT: s_getpc_b64 s[8:9] -; GFX9-O3-NEXT: s_add_u32 s8, s8, called@rel32@lo+4 -; GFX9-O3-NEXT: s_addc_u32 s9, s9, called@rel32@hi+12 +; GFX9-O3-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GFX9-O3-NEXT: s_mov_b32 s14, s8 +; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-O3-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-O3-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] +; GFX9-O3-NEXT: s_load_dword s4, s[2:3], 0x34 +; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-O3-NEXT: 
s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[12:13] -; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[14:15] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[8:9] -; GFX9-O3-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O3-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX9-O3-NEXT: s_mov_b64 exec, s[10:11] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O3-NEXT: s_add_u32 s8, s2, 56 +; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 +; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 +; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 +; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] +; GFX9-O3-NEXT: s_mov_b32 s12, s6 +; GFX9-O3-NEXT: s_mov_b32 s13, s7 +; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 +; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-O3-NEXT: s_getpc_b64 s[22:23] +; GFX9-O3-NEXT: s_add_u32 s22, s22, called@rel32@lo+4 +; GFX9-O3-NEXT: s_addc_u32 s23, s23, called@rel32@hi+12 +; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23] +; GFX9-O3-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-O3-NEXT: v_add_u32_e32 v3, v3, v6 +; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-O3-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 ; GFX9-O3-NEXT: s_endpgm @@ -689,42 +705,57 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; ; GFX9-O3-LABEL: call_i64: ; GFX9-O3: ; %bb.0: -; GFX9-O3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-O3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-O3-NEXT: s_mov_b32 s14, -1 -; GFX9-O3-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-O3-NEXT: s_add_u32 s12, s12, s3 -; GFX9-O3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-O3-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX9-O3-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 +; GFX9-O3-NEXT: 
s_mov_b32 s25, SCRATCH_RSRC_DWORD1 +; GFX9-O3-NEXT: s_mov_b32 s26, -1 +; GFX9-O3-NEXT: s_mov_b32 s27, 0xe00000 +; GFX9-O3-NEXT: s_add_u32 s24, s24, s9 ; GFX9-O3-NEXT: s_mov_b32 s32, 0 -; GFX9-O3-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-O3-NEXT: s_getpc_b64 s[4:5] -; GFX9-O3-NEXT: s_add_u32 s4, s4, called_i64@gotpcrel32@lo+4 -; GFX9-O3-NEXT: s_addc_u32 s5, s5, called_i64@gotpcrel32@hi+12 -; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-O3-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GFX9-O3-NEXT: s_mov_b32 s14, s8 +; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-O3-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-O3-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] +; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, s3 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-O3-NEXT: v_mov_b32_e32 v7, s5 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[12:13] -; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[14:15] +; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O3-NEXT: s_add_u32 s8, s2, 60 +; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-O3-NEXT: s_getpc_b64 s[2:3] +; GFX9-O3-NEXT: s_add_u32 s2, s2, called_i64@gotpcrel32@lo+4 +; GFX9-O3-NEXT: s_addc_u32 s3, s3, called_i64@gotpcrel32@hi+12 +; GFX9-O3-NEXT: s_load_dwordx2 s[22:23], s[2:3], 0x0 +; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 +; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 +; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 +; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] +; 
GFX9-O3-NEXT: s_mov_b32 s12, s6 +; GFX9-O3-NEXT: s_mov_b32 s13, s7 +; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 +; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc -; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-O3-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 offset:4 +; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23] +; GFX9-O3-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-O3-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX9-O3-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v7, vcc +; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0 offset:4 ; GFX9-O3-NEXT: s_endpgm @@ -1308,33 +1339,49 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; ; GFX9-O3-LABEL: strict_wwm_call: ; GFX9-O3: ; %bb.0: -; GFX9-O3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-O3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-O3-NEXT: s_mov_b32 s14, -1 -; GFX9-O3-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-O3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-O3-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-O3-NEXT: s_add_u32 s12, s12, s3 -; GFX9-O3-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-O3-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 +; GFX9-O3-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 +; GFX9-O3-NEXT: s_mov_b32 s26, -1 +; GFX9-O3-NEXT: s_mov_b32 s27, 0xe00000 +; GFX9-O3-NEXT: s_add_u32 s24, s24, s9 ; GFX9-O3-NEXT: s_mov_b32 s32, 0 -; GFX9-O3-NEXT: s_getpc_b64 s[8:9] -; GFX9-O3-NEXT: s_add_u32 s8, s8, strict_wwm_called@rel32@lo+4 
-; GFX9-O3-NEXT: s_addc_u32 s9, s9, strict_wwm_called@rel32@hi+12 +; GFX9-O3-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GFX9-O3-NEXT: s_mov_b32 s14, s8 +; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-O3-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-O3-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] +; GFX9-O3-NEXT: s_load_dword s4, s[2:3], 0x34 +; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[12:13] -; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[14:15] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[8:9] -; GFX9-O3-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O3-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX9-O3-NEXT: s_mov_b64 exec, s[10:11] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O3-NEXT: s_add_u32 s8, s2, 56 +; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 +; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 +; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 +; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] +; GFX9-O3-NEXT: s_mov_b32 s12, s6 +; GFX9-O3-NEXT: s_mov_b32 s13, s7 +; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 +; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-O3-NEXT: s_getpc_b64 s[22:23] +; GFX9-O3-NEXT: s_add_u32 s22, s22, strict_wwm_called@rel32@lo+4 +; GFX9-O3-NEXT: s_addc_u32 s23, s23, strict_wwm_called@rel32@hi+12 +; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23] +; GFX9-O3-NEXT: v_mov_b32_e32 v3, v0 +; 
GFX9-O3-NEXT: v_add_u32_e32 v3, v3, v6 +; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-O3-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 ; GFX9-O3-NEXT: s_endpgm @@ -1559,42 +1606,57 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; ; GFX9-O3-LABEL: strict_wwm_call_i64: ; GFX9-O3: ; %bb.0: -; GFX9-O3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-O3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-O3-NEXT: s_mov_b32 s14, -1 -; GFX9-O3-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-O3-NEXT: s_add_u32 s12, s12, s3 -; GFX9-O3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-O3-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX9-O3-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 +; GFX9-O3-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 +; GFX9-O3-NEXT: s_mov_b32 s26, -1 +; GFX9-O3-NEXT: s_mov_b32 s27, 0xe00000 +; GFX9-O3-NEXT: s_add_u32 s24, s24, s9 ; GFX9-O3-NEXT: s_mov_b32 s32, 0 -; GFX9-O3-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-O3-NEXT: s_getpc_b64 s[4:5] -; GFX9-O3-NEXT: s_add_u32 s4, s4, strict_wwm_called_i64@gotpcrel32@lo+4 -; GFX9-O3-NEXT: s_addc_u32 s5, s5, strict_wwm_called_i64@gotpcrel32@hi+12 -; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-O3-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GFX9-O3-NEXT: s_mov_b32 s14, s8 +; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-O3-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-O3-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] +; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, s3 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-O3-NEXT: v_mov_b32_e32 v7, s5 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: 
v_mov_b32_e32 v6, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[12:13] -; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[14:15] +; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O3-NEXT: s_add_u32 s8, s2, 60 +; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-O3-NEXT: s_getpc_b64 s[2:3] +; GFX9-O3-NEXT: s_add_u32 s2, s2, strict_wwm_called_i64@gotpcrel32@lo+4 +; GFX9-O3-NEXT: s_addc_u32 s3, s3, strict_wwm_called_i64@gotpcrel32@hi+12 +; GFX9-O3-NEXT: s_load_dwordx2 s[22:23], s[2:3], 0x0 +; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 +; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 +; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 +; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] +; GFX9-O3-NEXT: s_mov_b32 s12, s6 +; GFX9-O3-NEXT: s_mov_b32 s13, s7 +; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 +; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc -; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-O3-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 offset:4 +; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23] +; GFX9-O3-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-O3-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX9-O3-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v7, vcc +; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0 offset:4 ; GFX9-O3-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll 
b/llvm/test/CodeGen/AMDGPU/xor.ll index e15fd7f29671a..9fac17f33d0d3 100644 --- a/llvm/test/CodeGen/AMDGPU/xor.ll +++ b/llvm/test/CodeGen/AMDGPU/xor.ll @@ -5,31 +5,31 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: xor_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v1, v3, v1 ; SI-NEXT: v_xor_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: xor_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -54,33 +54,33 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr 
addrspace(1) %in1) { ; SI-LABEL: xor_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v3, v7, v3 ; SI-NEXT: v_xor_b32_e32 v2, v6, v2 ; SI-NEXT: v_xor_b32_e32 v1, v5, v1 ; SI-NEXT: v_xor_b32_e32 v0, v4, v0 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: xor_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -107,8 +107,8 @@ define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: xor_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s2, s10 @@ -133,8 +133,8 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ; ; VI-LABEL: xor_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -165,32 +165,32 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: v_xor_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_ubyte v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 glc +; SI-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: v_xor_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 1, v0 -; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_byte v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; 
VI-LABEL: v_xor_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -216,30 +216,30 @@ define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0 define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: vector_xor_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: vector_xor_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: 
v_mov_b32_e32 v1, s7 @@ -263,7 +263,7 @@ define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: scalar_xor_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -276,7 +276,7 @@ define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; VI-LABEL: scalar_xor_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -292,8 +292,8 @@ define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) { ; SI-LABEL: scalar_not_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -304,10 +304,10 @@ define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) { ; ; VI-LABEL: scalar_not_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_not_b32 s2, s2 +; VI-NEXT: s_not_b32 s2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -321,7 +321,7 @@ define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) { define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: 
vector_not_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -339,7 +339,7 @@ define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: vector_not_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -360,31 +360,31 @@ define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: vector_xor_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v0, v2, v0 ; SI-NEXT: v_xor_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: 
vector_xor_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -409,8 +409,8 @@ define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; SI-LABEL: scalar_xor_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -424,8 +424,8 @@ define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) ; ; VI-LABEL: scalar_xor_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_xor_b64 s[0:1], s[6:7], s[0:1] @@ -442,7 +442,7 @@ define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) { ; SI-LABEL: scalar_not_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -456,7 +456,7 @@ define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) { ; ; VI-LABEL: scalar_not_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: 
v_mov_b32_e32 v1, s1 @@ -473,7 +473,7 @@ define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) { define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: vector_not_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -492,7 +492,7 @@ define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: vector_not_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -514,7 +514,7 @@ define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b) { ; SI-LABEL: xor_cf: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 @@ -545,7 +545,7 @@ define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i ; ; VI-LABEL: xor_cf: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -591,8 +591,8 @@ endif: define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: scalar_xor_literal_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 
-1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -605,15 +605,15 @@ define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i3 ; ; VI-LABEL: scalar_xor_literal_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s3, s3, 0xf237b -; VI-NEXT: s_xor_b32 s2, s2, 0x3039 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_xor_b32 s1, s1, 0xf237b +; VI-NEXT: s_xor_b32 s0, s0, 0x3039 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %or = xor i64 %a, 4261135838621753 @@ -624,30 +624,30 @@ define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, i64 %b) { ; SI-LABEL: scalar_xor_literal_multi_use_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x13 ; SI-NEXT: s_movk_i32 s8, 0x3039 ; SI-NEXT: s_mov_b32 s9, 0xf237b -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9] -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: s_add_u32 s0, s2, 0x3039 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; SI-NEXT: s_addc_u32 s1, s3, 0xf237b +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_add_u32 s0, s6, 0x3039 +; SI-NEXT: s_addc_u32 s1, s7, 0xf237b ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: scalar_xor_literal_multi_use_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_movk_i32 s2, 0x3039 ; VI-NEXT: s_mov_b32 s3, 0xf237b ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -675,8 +675,8 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %ou define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: scalar_xor_inline_imm_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -688,14 +688,14 @@ define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x ; ; VI-LABEL: scalar_xor_inline_imm_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s2, s2, 63 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_xor_b32 s0, s0, 63 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: 
flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %or = xor i64 %a, 63 @@ -706,8 +706,8 @@ define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: scalar_xor_neg_inline_imm_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -719,14 +719,14 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, ; ; VI-LABEL: scalar_xor_neg_inline_imm_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b64 s[2:3], s[2:3], -8 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_xor_b64 s[0:1], s[0:1], -8 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm %or = xor i64 %a, -8 @@ -737,7 +737,7 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SI-LABEL: vector_xor_i64_neg_inline_imm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -756,7 +756,7 @@ define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, 
; ; VI-LABEL: vector_xor_i64_neg_inline_imm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -777,7 +777,7 @@ define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SI-LABEL: vector_xor_literal_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -796,7 +796,7 @@ define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: vector_xor_literal_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll index f9a7e887ada23..28da8ac423107 100644 --- a/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @zext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a, i32 %b) { ; GCN-LABEL: zext_i16_to_i32_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -25,9 +25,9 @@ define amdgpu_kernel void @zext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a define amdgpu_kernel void @zext_i16_to_i64_uniform(ptr addrspace(1) %out, i16 %a, i64 %b) { ; GCN-LABEL: zext_i16_to_i64_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[0:1], 0xb -; GCN-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s6, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -47,8 +47,8 @@ define amdgpu_kernel void @zext_i16_to_i64_uniform(ptr addrspace(1) %out, i16 %a define amdgpu_kernel void @zext_i16_to_i32_divergent(ptr addrspace(1) %out, i16 %a, i32 %b) { ; GCN-LABEL: zext_i16_to_i32_divergent: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -68,8 +68,8 @@ define amdgpu_kernel void @zext_i16_to_i32_divergent(ptr addrspace(1) %out, i16 define amdgpu_kernel void @zext_i16_to_i64_divergent(ptr addrspace(1) %out, i16 %a, i64 %b) { ; GCN-LABEL: zext_i16_to_i64_divergent: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll index 2588d88b002b8..3b4ebef152967 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll @@ -525,6 +525,8 @@ } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #2 + +attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" 
"amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } attributes #1 = { nounwind } attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } !0 = !{} diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll index 9939366e855c4..138106632c1bc 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll @@ -56,4 +56,4 @@ define amdgpu_kernel void @scavenge_fi(ptr addrspace(1) %out, i32 %in) #0 { ret void } -attributes #0 = { nounwind } +attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll index 8922a233b1d8f..3f6f0c909e8bb 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll @@ -43,7 +43,7 @@ ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3' ; CHECK-NEXT: body: -define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) { +define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) #0 { bb0: %tmp = icmp ne i32 %arg1, 0 br i1 %tmp, label %bb2, label %bb3 @@ -66,5 +66,5 @@ bb4: ret void } -attributes #0 = { nounwind } +attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll index 8326d95e0e7f2..b3ed7376a1ede 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll @@ -27,10 +27,16 @@ ; CHECK-NEXT: returnsVoid: true ; CHECK-NEXT: argumentInfo: ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; CHECK-NEXT: workGroupIDX: { reg: '$sgpr6' } -; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' } +; CHECK-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr6_sgpr7' } +; CHECK-NEXT: dispatchID: { reg: '$sgpr8_sgpr9' } +; CHECK-NEXT: workGroupIDX: { reg: '$sgpr10' } +; CHECK-NEXT: workGroupIDY: { reg: '$sgpr11' } +; CHECK-NEXT: workGroupIDZ: { reg: '$sgpr12' } +; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr13' } ; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' } +; CHECK-NEXT: workItemIDY: { reg: '$vgpr1' } +; CHECK-NEXT: workItemIDZ: { reg: '$vgpr2' } ; CHECK-NEXT: psInputAddr: 0 ; CHECK-NEXT: psInputEnable: 0 ; CHECK-NEXT: mode: diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll index f8d97c81698cd..d9c3c4b17090b 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll @@ -8,11 +8,11 @@ declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr nocapture, double) # define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) { ; CHECK-LABEL: InferNothing: ; CHECK: ; %bb.0: ; %entry -; 
CHECK-NEXT: s_load_dword s2, s[0:1], 0x24 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s3, s2, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; CHECK-NEXT: s_ashr_i32 s1, s0, 31 +; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; CHECK-NEXT: s_add_u32 s0, s0, s4 ; CHECK-NEXT: s_addc_u32 s1, s1, s5 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -34,12 +34,12 @@ entry: define protected amdgpu_kernel void @InferFadd(i32 %a, ptr addrspace(1) %b, double %c) { ; CHECK-LABEL: InferFadd: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x24 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s3, s2, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; CHECK-NEXT: s_ashr_i32 s1, s0, 31 +; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; CHECK-NEXT: s_add_u32 s0, s4, s0 ; CHECK-NEXT: v_mov_b32_e32 v0, s6 ; CHECK-NEXT: v_mov_b32_e32 v1, s7 @@ -58,12 +58,12 @@ entry: define protected amdgpu_kernel void @InferFmax(i32 %a, ptr addrspace(1) %b, double %c) { ; CHECK-LABEL: InferFmax: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x24 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s3, s2, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; CHECK-NEXT: s_ashr_i32 s1, s0, 31 +; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; CHECK-NEXT: s_add_u32 s0, s4, s0 ; CHECK-NEXT: v_mov_b32_e32 v0, s6 ; CHECK-NEXT: v_mov_b32_e32 v1, s7 @@ -82,12 +82,12 @@ entry: define protected amdgpu_kernel void @InferFmin(i32 %a, ptr addrspace(1) %b, double %c) 
{ ; CHECK-LABEL: InferFmin: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x24 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s3, s2, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; CHECK-NEXT: s_ashr_i32 s1, s0, 31 +; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; CHECK-NEXT: s_add_u32 s0, s4, s0 ; CHECK-NEXT: v_mov_b32_e32 v0, s6 ; CHECK-NEXT: v_mov_b32_e32 v1, s7 @@ -106,13 +106,13 @@ entry: define protected amdgpu_kernel void @InferMixed(i32 %a, ptr addrspace(1) %b, double %c, ptr %d) { ; CHECK-LABEL: InferMixed: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x24 -; CHECK-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x3c +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c ; CHECK-NEXT: v_mov_b32_e32 v4, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s3, s2, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; CHECK-NEXT: s_ashr_i32 s1, s0, 31 +; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; CHECK-NEXT: v_mov_b32_e32 v0, s8 ; CHECK-NEXT: v_mov_b32_e32 v1, s9 ; CHECK-NEXT: s_add_u32 s0, s4, s0 @@ -140,11 +140,11 @@ bb1: define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, double %c) { ; CHECK-LABEL: InferPHI: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x24 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s3, s2, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; CHECK-NEXT: s_ashr_i32 s1, s0, 31 +; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; CHECK-NEXT: s_add_u32 s0, s4, s0 ; CHECK-NEXT: 
s_addc_u32 s1, s5, s1 ; CHECK-NEXT: s_add_u32 s2, s0, -8 diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected index 06a8a6fa04828..38b8ba12f0662 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected @@ -5,8 +5,8 @@ define i64 @i64_test(i64 %i) nounwind readnone { ; CHECK-LABEL: i64_test: ; CHECK: SelectionDAG has 25 nodes: ; CHECK-NEXT: t0: ch,glue = EntryToken -; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %0 -; CHECK-NEXT: t4: i32,ch = CopyFromReg # D:1 t0, Register:i32 %1 +; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %7 +; CHECK-NEXT: t4: i32,ch = CopyFromReg # D:1 t0, Register:i32 %8 ; CHECK-NEXT: t49: i64 = REG_SEQUENCE # D:1 TargetConstant:i32<60>, t2, TargetConstant:i32<3>, t4, TargetConstant:i32<11> ; CHECK-NEXT: t26: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 ; CHECK-NEXT: t29: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<4>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 @@ -28,7 +28,7 @@ define i64 @i32_test(i32 %i) nounwind readnone { ; CHECK-LABEL: i32_test: ; CHECK: SelectionDAG has 15 nodes: ; CHECK-NEXT: t0: ch,glue = EntryToken -; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %0 +; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %7 ; CHECK-NEXT: t6: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 ; CHECK-NEXT: t7: i32,i1 = 
V_ADD_CO_U32_e64 # D:1 t2, t6, TargetConstant:i1<0> ; CHECK-NEXT: t14: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t7 @@ -47,7 +47,7 @@ define i64 @i16_test(i16 %i) nounwind readnone { ; CHECK-LABEL: i16_test: ; CHECK: SelectionDAG has 18 nodes: ; CHECK-NEXT: t0: ch,glue = EntryToken -; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %0 +; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %7 ; CHECK-NEXT: t19: i32,ch = BUFFER_LOAD_USHORT_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 ; CHECK-NEXT: t20: i32,i1 = V_ADD_CO_U32_e64 # D:1 t2, t19, TargetConstant:i1<0> ; CHECK-NEXT: t24: i32 = S_MOV_B32 TargetConstant:i32<65535> @@ -68,7 +68,7 @@ define i64 @i8_test(i8 %i) nounwind readnone { ; CHECK-LABEL: i8_test: ; CHECK: SelectionDAG has 18 nodes: ; CHECK-NEXT: t0: ch,glue = EntryToken -; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %0 +; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %7 ; CHECK-NEXT: t19: i32,ch = BUFFER_LOAD_UBYTE_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 ; CHECK-NEXT: t20: i32,i1 = V_ADD_CO_U32_e64 # D:1 t2, t19, TargetConstant:i1<0> ; CHECK-NEXT: t24: i32 = S_MOV_B32 TargetConstant:i32<255>